# Annotate complexes


In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Put the parent of the current working directory at the front of sys.path
# so that the project imports that follow can be resolved.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Import AAAIM functions
from core import annotate_model, curate_model
from core.model_info import (
    find_species_with_chebi_annotations,
    find_species_with_uniprot_annotations,
    extract_model_info
)
from core.database_search import force_clear_chromadb, get_species_recommendations_rag
from utils.evaluation import (
    evaluate_single_model,
    evaluate_models_in_folder,
    print_evaluation_results,
    compare_results,
    process_saved_llm_responses,
    analyze_bqbiol_qualifier_statistics,
    filter_qualifiers_in_results
)

## Annotation distribution across bqbiol qualifiers and ontologies

Count the number of species that contain each ontology term for each bqbiol qualifier.

In [3]:
# Qualifier statistics for the BioModels collection. With verbose=True the
# helper prints one "Processing ..." line per model file (see the output).
stats = analyze_bqbiol_qualifier_statistics(
    model_dir="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
    output_file="BioModels_bqbiol_qualifier_statistics_all.csv",
    verbose=True,
)

Analyzing 1063 models for bqbiol qualifier statistics...
Processing BIOMD0000000634.xml...
Processing BIOMD0000000152.xml...
Processing BIOMD0000000146.xml...
Processing BIOMD0000000620.xml...
Processing BIOMD0000000608.xml...
Processing BIOMD0000000191.xml...
Processing BIOMD0000000185.xml...
Processing BIOMD0000000813.xml...
Processing BIOMD0000000807.xml...
Processing BIOMD0000000350.xml...
Processing BIOMD0000000436.xml...
Processing BIOMD0000000422.xml...
Processing BIOMD0000000344.xml...
Processing BIOMD0000001072.xml...
Processing BIOMD0000000378.xml...
Processing BIOMD0000000393.xml...
Processing BIOMD0000000387.xml...
Processing BIOMD0000000224.xml...
Processing BIOMD0000000542.xml...
Processing BIOMD0000000556.xml...
Processing BIOMD0000000230.xml...
Processing BIOMD0000000218.xml...
Processing BIOMD0000000581.xml...
Processing BIOMD0000000595.xml...
Processing BIOMD0000000740.xml...
Processing BIOMD0000000998.xml...
Processing BIOMD0000000026.xml...
Processing BIOMD000000003

In [2]:
# Same qualifier statistics, this time for the BioDivine collection.
# NOTE: this rebinds `stats`, replacing the value from the BioModels run.
stats = analyze_bqbiol_qualifier_statistics(
    model_dir="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioDivine",
    output_file="BioDivine_bqbiol_qualifier_statistics_all.csv",
    verbose=True,
)

Analyzing 190 models for bqbiol qualifier statistics...
Processing 054_PC12-CELL-DIFFERENTIATION_source.sbml...
Processing 038_SKBR3-BREAST-CELL-LINE-LONG-TERM_source.sbml...
Processing 073_LYMPHOID-AND-MYELOID-CELL-SPECIFICATION_source.sbml...
Processing 126_ORF3A_source.sbml...
Processing 043_BORDETELLA-BRONCHISEPTICA_source.sbml...
Processing 161_DIFFERENTIATION-OF-MONOCYTES_source.sbml...
Processing 078_IMMUNE-SYSTEM_source.sbml...
Processing 009_YEAST-APOPTOSIS_source.sbml...
Processing 072_LYMPHOPOIESIS-REGULATORY-NETWORK_source.sbml...
Processing 065_TUMOUR-CELL-INVASION-AND-MIGRATION_source.sbml...
Processing 198_PAIR-RULE-MODULE_source.sbml...
Processing 090_MAPK-REDUCED-2_source.sbml...
Processing 179_MICROENVIRONMENT-CONTROL_source.sbml...
Processing 029_TOLL-PATHWAY-OF-DROSOPHILA_source.sbml...
Processing 010_CARDIAC-DEVELOPMENT_source.sbml...
Processing 172_DROSOPHILA-GAP-D_source.sbml...
Processing 193_SENESCENCE-G1S-CHECKPOINT_source.sbml...
Processing 123_NSP4-NSP6_sour

## Filter results

In [7]:
# Restrict the llama-4 ChEBI results to the `hasPart` qualifier and save the
# filtered table next to the original results file.
filtered = filter_qualifiers_in_results(
    results_csv="results/biomd_chebi_rag_llama-4_top3_prompt_adjusted.csv",
    bqbiol_qualifiers=["hasPart"],
    model_dir="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
    output_csv="results/biomd_chebi_rag_llama-4_top3_prompt_adjusted_hasPart.csv",
    entity_type="chemical",
    database="chebi",
)

INFO Filtered results saved to: results/biomd_chebi_rag_llama-4_top3_prompt_adjusted_hasPart.csv


In [8]:
# Summary metrics for the llama-4 run, `hasPart` annotations only.
print_evaluation_results(
    "results/biomd_chebi_rag_llama-4_top3_prompt_adjusted_hasPart.csv",
    ref_results_csv=None,
)

Showing all results
Number of models assessed: 95
Number of models with predictions: 86
Number of annotations evaluated: 1043
Average accuracy (per model): 0.41
Ave. recall (formula): 0.38
Ave. precision (formula): 0.31
Ave. recall (exact): 0.32
Ave. precision (exact): 0.13
Average accuracy (per species): 0.26
Ave. recall (formula, per species): 0.24
Ave. precision (formula, per species): 0.18
Ave. recall (exact, per species): 0.21
Ave. precision (exact, per species): 0.08
Ave. total time (per model): 11.58
Ave. total time (per element, per model): 1.05
Ave. LLM time (per model): 11.29
Ave. LLM time (per element, per model): 1.03
Average number of predictions per species: 2.60


In [4]:
# Summary metrics for the meta-llama run, `hasPart` annotations only.
print_evaluation_results(
    "results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_hasPart.csv",
    ref_results_csv=None,
)

Showing all results
Number of models assessed: 95
Number of models with predictions: 88
Number of annotations evaluated: 1043
Average accuracy (per model): 0.38
Ave. recall (formula): 0.36
Ave. precision (formula): 0.27
Ave. recall (exact): 0.31
Ave. precision (exact): 0.12
Average accuracy (per species): 0.19
Ave. recall (formula, per species): 0.18
Ave. precision (formula, per species): 0.12
Ave. recall (exact, per species): 0.14
Ave. precision (exact, per species): 0.06
Ave. total time (per model): 5.09
Ave. total time (per element, per model): 0.46
Ave. LLM time (per model): 4.74
Ave. LLM time (per element, per model): 0.43
Average number of predictions per species: 2.47


In [5]:
# Restrict the meta-llama ChEBI results to the `is` and `isVersionOf`
# qualifiers and save the filtered table as a separate CSV.
filtered = filter_qualifiers_in_results(
    results_csv="results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted.csv",
    bqbiol_qualifiers=["is", "isVersionOf"],
    model_dir="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
    output_csv="results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_is+isVersionOf.csv",
    entity_type="chemical",
    database="chebi",
)

INFO Filtered results saved to: results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_is+isVersionOf.csv


In [6]:
# Summary metrics for the meta-llama run, `is` + `isVersionOf` annotations.
print_evaluation_results(
    "results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_is+isVersionOf.csv",
    ref_results_csv=None,
)

Showing all results
Number of models assessed: 317
Number of models with predictions: 313
Number of annotations evaluated: 9654
Average accuracy (per model): 0.86
Ave. recall (formula): 0.86
Ave. precision (formula): 0.65
Ave. recall (exact): 0.73
Ave. precision (exact): 0.25
Average accuracy (per species): 0.40
Ave. recall (formula, per species): 0.40
Ave. precision (formula, per species): 0.29
Ave. recall (exact, per species): 0.29
Ave. precision (exact, per species): 0.12
Ave. total time (per model): 5.03
Ave. total time (per element, per model): 0.17
Ave. LLM time (per model): 4.51
Ave. LLM time (per element, per model): 0.15
Average number of predictions per species: 2.87


In [8]:
# Summary metrics for the meta-llama run, `is` annotations only.
# NOTE: the CSV read here is the `output_csv` of the `['is']` filter cell that
# appears further down in this notebook; that cell must have been run first.
print_evaluation_results(
    "results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_filter_is.csv",
    ref_results_csv=None,
)

Showing all results
Number of models assessed: 190
Number of models with predictions: 187
Number of annotations evaluated: 3480
Average accuracy (per model): 0.88
Ave. recall (formula): 0.87
Ave. precision (formula): 0.66
Ave. recall (exact): 0.76
Ave. precision (exact): 0.26
Average accuracy (per species): 0.59
Ave. recall (formula, per species): 0.59
Ave. precision (formula, per species): 0.43
Ave. recall (exact, per species): 0.49
Ave. precision (exact, per species): 0.17
Ave. total time (per model): 5.29
Ave. total time (per element, per model): 0.29
Ave. LLM time (per model): 4.84
Ave. LLM time (per element, per model): 0.26
Average number of predictions per species: 2.90


In [None]:
# Restrict the meta-llama ChEBI results to the `is` qualifier. The resulting
# *_filter_is.csv is the file an earlier cell passes to
# print_evaluation_results.
filtered = filter_qualifiers_in_results(
    results_csv="results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted.csv",
    bqbiol_qualifiers=["is"],
    model_dir="/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels",
    output_csv="results/biomd_chebi_rag_meta-llama_top3_prompt_adjusted_filter_is.csv",
    entity_type="chemical",
    database="chebi",
)

In [10]:
# Side-by-side report for the llama-4 run: the unfiltered results first,
# then the subset restricted to the `is` qualifier.
print("Results for all qualifiers")
print_evaluation_results(
    "results/biomd_chebi_rag_llama-4_top3_prompt_adjusted.csv",
    ref_results_csv=None,
)

print("Results for is qualifier")
print_evaluation_results(
    "results/biomd_chebi_rag_llama-4_top3_prompt_adjusted_filter_is.csv",
    ref_results_csv=None,
)

Results for all qualifiers
Showing all results
Number of models assessed: 340
Number of models with predictions: 338
Number of annotations evaluated: 10771
Average accuracy (per model): 0.79
Ave. recall (formula): 0.78
Ave. precision (formula): 0.59
Ave. recall (exact): 0.68
Ave. precision (exact): 0.24
Average accuracy (per species): 0.40
Ave. recall (formula, per species): 0.39
Ave. precision (formula, per species): 0.28
Ave. recall (exact, per species): 0.28
Ave. precision (exact, per species): 0.13
Ave. total time (per model): 10.16
Ave. total time (per element, per model): 0.32
Ave. LLM time (per model): 9.77
Ave. LLM time (per element, per model): 0.31
Average number of predictions per species: 2.84
Results for is qualifier
Showing all results
Number of models assessed: 190
Number of models with predictions: 189
Number of annotations evaluated: 3480
Average accuracy (per model): 0.88
Ave. recall (formula): 0.88
Ave. precision (formula): 0.66
Ave. recall (exact): 0.78
Ave. precisi

## Failure reasons

In [15]:
# Load the manually reviewed results workbook and tally the `Wrong_reason`
# codes. dropna=False keeps rows without a recorded reason as a NaN bucket.
excel_file = "/Users/luna/Desktop/CRBM/AMAS_proj/AAAIM/tests/results/biomd_chebi_rag_llama-4_top10_prompt_adjusted_review.xlsx"
df = pd.read_excel(excel_file)
reason_counts = df["Wrong_reason"].value_counts(dropna=False)
print("Counts of each unique value in 'Wrong_reason':", reason_counts, sep="\n")

Counts of each unique value in 'Wrong_reason':
Wrong_reason
NaN    10721
6.0       29
5.0        6
3.0        5
2.0        4
1.0        3
8.0        2
4.0        1
Name: count, dtype: int64


## Run for complexes

## Test functions

In [5]:
# Model used by the function checks below; report whether it exists on disk.
test_model_file = "/Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000088.xml"
print(
    f"✓ Test model found: {test_model_file}"
    if os.path.exists(test_model_file)
    else f"✗ Test model not found: {test_model_file}"
)

✓ Test model found: /Users/luna/Desktop/CRBM/AMAS_proj/Models/BioModels/BIOMD0000000088.xml


In [6]:
# Show the ChEBI annotations of the test model, restricted to `hasPart`.
find_species_with_chebi_annotations(
    test_model_file,
    ["hasPart"],
)

{'s187': ['15996'],
 's173': ['16595'],
 's214': ['15996'],
 's231': ['17552'],
 's252': ['15996'],
 's311': ['18035'],
 's436': ['15996'],
 's438': ['17552'],
 's439': ['15996'],
 's440': ['17552'],
 's443': ['15996'],
 's449': ['15996'],
 's456': ['15996'],
 's506': ['29108'],
 's512': ['29108'],
 's513': ['29108'],
 's520': ['29108'],
 's526': ['29108'],
 's539': ['29108'],
 's546': ['29108'],
 's551': ['29108'],
 's564': ['29108', '15996'],
 's565': ['29108', '18035'],
 's566': ['29108'],
 's567': ['29108', '15996'],
 's568': ['29108', '18348'],
 's569': ['29108']}

In [17]:
import re

# Probe the identifiers.org pattern against a sample URI. The capture group
# `[^/:]+` stops at the first '/' or ':', so for this URI only the namespace
# prefix ("GO") is returned, not the accession after the colon.
identifiers_pattern = r'http[s]?://identifiers\.org/([^/:]+)'
text = "http://identifiers.org/GO:0042752"
re.compile(identifiers_pattern).findall(text)

['GO']