In [None]:
import pickle 
from pathlib import Path
import os
import shutil
from Bio import SeqIO
from pathlib import Path
import gzip, shutil, pickle
from Bio import SeqIO
import numpy as np
import re
import pickle
import gzip
import shutil
import numpy as np
from pathlib import Path
from Bio import SeqIO

In [8]:
# Collect every FASTA in the input folder and derive a genome label from the
# second underscore-separated field of each file name.
fasta_dir = Path("genomes_to_annotate_with_PlasmoFP")
fasta_files = [fasta for fasta in fasta_dir.glob("*.fasta")]

genomes = set(map(lambda fasta: fasta.name.split('_')[1], fasta_files))

# Alphabetical copy of the labels, used for deterministic iteration.
genome_list = list(genomes)
genome_list.sort()

print(genomes)       # unordered, e.g. {'PbergheiANKA', 'PvivaxSal1', ...}
print(genome_list)   # sorted, e.g. ['PbergheiANKA', 'PblacklockiG01', ...]


{'PbergheiANKA', 'PblacklockiG01', 'PmalariaeUG01', 'PvinckeibrucechwattiDA', 'PreichenowiCDC', 'PcynomolgiM', 'PovalecurtisiGH01', 'PovalewallikeriPowCR01', 'Pfalciparum3D7', 'PgaboniG01', 'PknowlesiH', 'PadleriG01', 'Pgallinaceum8A', 'PvivaxSal1', 'Pchabaudichabaudi', 'PinuiSanAntonio1', 'PfragileNilgiri', 'PcoatneyiHackeri', 'Pyoeliiyoelii17XNL2023'}
['PadleriG01', 'PbergheiANKA', 'PblacklockiG01', 'Pchabaudichabaudi', 'PcoatneyiHackeri', 'PcynomolgiM', 'Pfalciparum3D7', 'PfragileNilgiri', 'PgaboniG01', 'Pgallinaceum8A', 'PinuiSanAntonio1', 'PknowlesiH', 'PmalariaeUG01', 'PovalecurtisiGH01', 'PovalewallikeriPowCR01', 'PreichenowiCDC', 'PvinckeibrucechwattiDA', 'PvivaxSal1', 'Pyoeliiyoelii17XNL2023']


In [11]:
# Sanity check: how many distinct genome labels were derived from the FASTA file names.
print(len(genome_list))

19


In [None]:
# Resolve one annotated-proteins FASTA per genome label.  The canonical
# PlasmoDB-68 file name is tried first; otherwise a wildcard search is used
# and accepted only when it yields exactly one file.
fasta_dir = Path("genomes_to_annotate_with_PlasmoFP")

species_to_fasta = {}
for sp in genome_list:
    canonical = fasta_dir / f"PlasmoDB-68_{sp}_AnnotatedProteins.fasta"
    if canonical.exists():
        species_to_fasta[sp] = canonical
        continue

    candidates = list(fasta_dir.glob(f"*{sp}*AnnotatedProteins*.fasta"))
    if not candidates:
        print(f"no file found for {sp}")
    elif len(candidates) > 1:
        # Ambiguous: report and leave this genome out of the mapping.
        print(f"multiple matches for {sp}: {candidates}")
    else:
        species_to_fasta[sp] = candidates[0]

# Show the final label -> path mapping.
for sp, fasta_path in species_to_fasta.items():
    print(f"{sp} ‚Üí {fasta_path}")


In [None]:
# Stage a copy of every resolved proteome FASTA in a dedicated sub-folder.
dest_dir = Path("genomes_to_annotate_with_PlasmoFP/annotated_fasta")
dest_dir.mkdir(parents=True, exist_ok=True)

for species, fasta_path in species_to_fasta.items():
    dest_path = dest_dir / fasta_path.name
    try:
        shutil.copy2(fasta_path, dest_path)
        print(f"{species}: copied to {dest_path}")
    except Exception as e:
        # Best effort: report the failure and carry on with the next genome.
        # A missing source file gets its own, more specific message.
        if isinstance(e, FileNotFoundError):
            print(f"{species}: source file not found ({fasta_path})")
        else:
            print(f"{species}: failed to copy ({e})")


In [None]:
# Step 1: keep only sequences of at most 1200 residues, one output file per input.
src_dir = Path("genomes_to_annotate_with_PlasmoFP/annotated_fasta")
out_dir = Path("genomes_to_annotate_with_PlasmoFP/1_length_filtered_fastas")
out_dir.mkdir(parents=True, exist_ok=True)

for fasta_fp in src_dir.glob("*.fasta"):
    out_fp = out_dir / (fasta_fp.stem + "_filtered.fasta")
    # Lazy filter: records stream from the parser straight into the writer,
    # so a whole proteome is never held in memory.
    filtered_records = filter(
        lambda rec: len(rec.seq) <= 1200,
        SeqIO.parse(fasta_fp, "fasta"),
    )
    # SeqIO.write returns the number of records it wrote.
    count = SeqIO.write(filtered_records, out_fp, "fasta")
    print(f"{fasta_fp.name}: wrote {count} records ‚â§1200 aa ‚Üí {out_fp}")


In [None]:
# Input: the length-filtered FASTAs.  Output: FASTAs after the multi-step filter.
src_dir = Path("genomes_to_annotate_with_PlasmoFP/1_length_filtered_fastas")
out_dir = Path("genomes_to_annotate_with_PlasmoFP/2_multi_step_filtered_fastas")
# parents=True matches how every other output folder in this notebook is
# created, so this cell no longer raises FileNotFoundError when the parent
# folder does not exist yet.
out_dir.mkdir(parents=True, exist_ok=True)

# Lower-case substrings that mark a record for exclusion.  They are matched
# against the whole lower-cased FASTA description line.
# NOTE(review): this is plain substring matching, so short entries such as
# "vir", "pir", "cir" or "surf" also hit any longer word that contains them --
# confirm that this breadth is intended.
keywords = [
    "emp1", "merozoite surface protein", "msp",
    "rifin", "stevor", "fragment",
    "vir", "variant interspersed repeat", "variant interspersed repeats",
    "fam-",    # fam-a/b/c...
    "pir",     # PIR/CIR
    "cir",
    "yir",     # YIR family
    "surf",    # SURFIN
]

# Filter steps in the order they are applied; the names double as the keys of
# the dictionary returned by filter_counts().
_FILTER_STEPS = ("step1_non_pseudo", "step2_iso1", "step4_no_keyword", "step5_no_api")


def _first_failed_step(rec):
    """Return the name of the first filter step that rejects ``rec``, or None.

    Single source of truth for the filter: both ``filter_counts`` and
    ``keep_record`` call it, so the reported counts cannot drift away from the
    records that are actually kept.
    """
    desc = rec.description.lower()

    # Step 1: the description must explicitly say the record is not a pseudogene.
    if "is_pseudo=false" not in desc:
        return "step1_non_pseudo"

    # Step 2: the transcript ID must look like a first isoform.
    if "transcript=" not in desc:
        return "step2_iso1"
    id_tokens = desc.split("transcript=")[1].split()
    # A description with nothing after "transcript=" has no ID to test; reject
    # it here instead of raising IndexError.
    if not id_tokens or not id_tokens[0].endswith((".1", "_1", "_t1")):
        return "step2_iso1"

    # Step 4: none of the exclusion substrings may occur in the description.
    if any(kw in desc for kw in keywords):
        return "step4_no_keyword"

    # Step 5: the first "gene=" field must not contain "api".
    gene_parts = [p for p in desc.split("|") if p.strip().startswith("gene=")]
    if gene_parts and "api" in gene_parts[0]:
        return "step5_no_api"

    return None


def filter_counts(fasta_fp):
    """Count how many records of a FASTA file survive each filter step.

    Parameters
    ----------
    fasta_fp : path or handle accepted by ``SeqIO.parse``
        FASTA file to scan.

    Returns
    -------
    dict
        ``"total"`` plus one cumulative count per step
        (``"step1_non_pseudo"``, ``"step2_iso1"``, ``"step4_no_keyword"``,
        ``"step5_no_api"``); a record is counted for a step only if it also
        passed every earlier step.
    """
    counts = {"total": 0}
    counts.update((step, 0) for step in _FILTER_STEPS)

    for rec in SeqIO.parse(fasta_fp, "fasta"):
        counts["total"] += 1
        failed = _first_failed_step(rec)
        # Credit every step the record passed before it was rejected.
        for step in _FILTER_STEPS:
            if step == failed:
                break
            counts[step] += 1

    return counts


def keep_record(rec):
    """Return True when ``rec`` passes every filter step, False otherwise.

    ``rec`` only needs a ``description`` string attribute.
    """
    return _first_failed_step(rec) is None

# Apply the multi-step filter to every length-filtered FASTA: report how many
# records survive each step, write the survivors, and dump their IDs.
for fasta_fp in src_dir.glob("*.fasta"):
    counts = filter_counts(fasta_fp)
    print(f"\n=== {fasta_fp.name} ===")
    print(f"  total sequences: {counts['total']}")
    print(f"  after step1 (non-pseudo): {counts['step1_non_pseudo']}")
    print(f"  after step2 (isoform 1): {counts['step2_iso1']}")
    print(f"  after step4 (no keyword): {counts['step4_no_keyword']}")
    print(f"  after step5 (no API): {counts['step5_no_api']}\n")

    # Second pass over the same file, this time collecting the surviving records.
    kept = list(filter(keep_record, SeqIO.parse(fasta_fp, "fasta")))

    out_fa = out_dir / fasta_fp.name
    SeqIO.write(kept, out_fa, "fasta")
    print(f"Wrote {len(kept)} records ‚Üí {out_fa.name}")

    # NOTE(review): the values gathered here are read from the "transcript="
    # field, although the variable, the message and the output file all say
    # "gene IDs" -- confirm which identifier is wanted.
    gene_ids = set()
    for rec in kept:
        for field in rec.description.split("|"):
            if field.strip().startswith("transcript="):
                # Only the first matching field of a record is used.
                gene_ids.add(field.split("=", 1)[1].split()[0])
                break

    if not gene_ids:
        continue
    # with_suffix swaps the trailing ".fasta" for ".geneIDs.txt".
    txt_out = out_fa.with_suffix(".geneIDs.txt")
    txt_out.write_text("\n".join(sorted(gene_ids)))
    print(f"  ‚Üí dumped {len(gene_ids)} gene IDs to {txt_out.name}")


In [None]:


# Folder layout for the dictionary-building stage.
parent_gaf_dir = Path("genomes_to_annotate_with_PlasmoFP")
raw_dir        = Path("genomes_to_annotate_with_PlasmoFP")  # unfiltered FASTAs
filtered_dir   = Path("genomes_to_annotate_with_PlasmoFP/2_multi_step_filtered_fastas")
embed_dir      = Path("2_multi_step_filtered_fastas_embeddings_output")
gaf_out_dir    = Path("genomes_to_annotate_with_PlasmoFP/gaf_out_complete_and_filtered_2")
dict_out_dir   = Path("genomes_to_annotate_with_PlasmoFP/gene_dicts_out_complete_and_filtered_2")

print("Raw FASTA dir :", raw_dir)
print("Filtered dir  :", filtered_dir)
filtered_names = [fasta.name for fasta in filtered_dir.glob("*.fasta")]
print("Found filtered FASTAs:", filtered_names)

# Make sure both output folders exist before anything is written to them.
for out_folder in (gaf_out_dir, dict_out_dir):
    out_folder.mkdir(parents=True, exist_ok=True)

# Copy only the files named *_GO.gaf.gz from the parent folder into gaf_out_dir.
for g in parent_gaf_dir.glob("*_GO.gaf.gz"):
    shutil.copy2(g, gaf_out_dir / g.name)

# Lower-case GAF aspect letter -> suffix of the GO key used in the gene dictionaries.
aspect_map = dict(c="Component", f="Function", p="Process")
# Exclusion substrings, matched against the whole lower-cased FASTA description.
# NOTE(review): plain substring matching -- short entries such as "vir", "pir",
# "cir" or "surf" also hit longer words containing them; confirm this is intended.
keywords = [
    "emp1","merozoite surface protein","msp","rifin","stevor","fragment",
    "vir","variant interspersed repeat","variant interspersed repeats",
    "fam-","pir","cir","yir","surf"
]

def passes_filters(rec):
    """Return True when ``rec`` survives every filter step, False otherwise.

    ``rec`` only needs a ``description`` string attribute.  A record is kept
    when its lower-cased description:

    * contains ``is_pseudo=false``;
    * has a ``transcript=`` ID ending in ``.1``, ``_1`` or ``_t1``;
    * contains none of the ``keywords`` substrings;
    * has no ``api`` in its first ``gene=`` field.
    """
    desc = rec.description.lower()

    if "is_pseudo=false" not in desc:
        return False

    if "transcript=" not in desc:
        return False
    id_tokens = desc.split("transcript=")[1].split()
    # Nothing after "transcript=" means there is no ID to test: reject the
    # record rather than raising IndexError.
    if not id_tokens:
        return False
    if not id_tokens[0].endswith((".1", "_1", "_t1")):
        return False

    if any(kw in desc for kw in keywords):
        return False

    gene_fields = [p for p in desc.split("|") if p.strip().startswith("gene=")]
    if gene_fields and "api" in gene_fields[0]:
        return False
    return True

def safe_load_embeddings(embed_dir, embedding_stem):
    """Load the embeddings saved for one FASTA stem, trying three file formats.

    The files ``<embedding_stem>_embeddings.npz``, ``.pkl`` and ``.npy`` are
    tried in that order inside ``embed_dir``.  A file that exists but fails to
    load is reported on stdout and the next format is tried.

    Parameters
    ----------
    embed_dir : pathlib.Path
        Folder holding the embedding files.
    embedding_stem : str
        File-name prefix shared by the embedding files.

    Returns
    -------
    tuple
        ``(data, fmt)`` where ``fmt`` is ``"npz"`` (``data`` is a dict of
        arrays keyed by archive entry name), ``"pickle"`` (``data`` is the
        unpickled object), ``"numpy"`` (``data`` is the loaded array), or
        ``(None, None)`` when nothing could be loaded.
    """
    # Try NPZ first.
    npz_fp = embed_dir / f"{embedding_stem}_embeddings.npz"
    if npz_fp.exists():
        try:
            # The context manager closes the archive's file handle.  Every
            # array is read while the archive is still open, so the returned
            # dict does not depend on it afterwards.
            with np.load(npz_fp) as npz_data:
                embedding_dict = {key: npz_data[key] for key in npz_data.files}
            print(f"   ‚Ä¢ loaded NPZ embeddings for {len(embedding_dict)} sequences")
            return embedding_dict, "npz"
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading NPZ: {e}")

    # Then pickle.
    # SECURITY: unpickling can execute arbitrary code; only use this on
    # embedding files you produced yourself.
    pkl_fp = embed_dir / f"{embedding_stem}_embeddings.pkl"
    if pkl_fp.exists():
        try:
            with open(pkl_fp, "rb") as f:
                embedding_dict = pickle.load(f)
            print(f"   ‚Ä¢ loaded pickle embeddings for {len(embedding_dict)} sequences")
            return embedding_dict, "pickle"
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading pickle: {e}")

    # Finally a bare array; the caller has to align it with sequences by order.
    npy_fp = embed_dir / f"{embedding_stem}_embeddings.npy"
    if npy_fp.exists():
        try:
            emb_array = np.load(npy_fp)
            print(f"   ‚Ä¢ loaded numpy array embeddings: {emb_array.shape}")
            return emb_array, "numpy"
        except Exception as e:
            print(f"  ‚ö†Ô∏è  Error loading numpy: {e}")

    return None, None

# For every filtered FASTA, build and pickle two dictionaries keyed by protein
# (record) ID: a "complete" one covering every sequence of the matching raw
# FASTA and a "filtered" one restricted to the filtered sequences.  Each entry
# holds an "embedding" (or None) plus six GO-term lists.
# A genome is skipped, and nothing is written for it, when its embeddings, raw
# FASTA or GAF file is missing, or when an order-aligned embedding array does
# not have one row per filtered sequence.
for filtered_fp in sorted(filtered_dir.glob("*.fasta")):
    full_stem = filtered_fp.stem
    # Text before the last "_AnnotatedProteins"; reused below to name the raw
    # FASTA, the GAF file and both output pickles.
    base      = full_stem.rsplit("_AnnotatedProteins", 1)[0]
    print(f"\n‚ñ∂Ô∏è  Processing {base}")

    # 1) load embeddings; the loader tries NPZ, then pickle, then NPY
    embedding_stem = full_stem
    embeddings_data, embedding_format = safe_load_embeddings(embed_dir, embedding_stem)
    
    if embeddings_data is None:
        print(f"  ‚ö†Ô∏è  no embeddings found for {embedding_stem}, skipping")
        continue

    # 2) load filtered sequences
    seqs_filt = list(SeqIO.parse(str(filtered_fp), "fasta"))
    print(f"   ‚Ä¢ filtered seqs: {len(seqs_filt)}")

    # 3) build embedding dictionary based on format
    if embedding_format in ["npz", "pickle"]:
        # The loaded object is used directly as an ID -> embedding mapping.
        # NOTE(review): assumes its keys are FASTA record IDs -- confirm
        # against whatever wrote the embedding files.
        embedding_dict = embeddings_data
        print(f"   ‚Ä¢ using direct ID mapping from {embedding_format}")
        
        # Coverage check only: missing IDs are counted and reported, not fatal.
        missing_embeddings = 0
        for rec in seqs_filt:
            if rec.id not in embedding_dict:
                missing_embeddings += 1
        
        if missing_embeddings > 0:
            print(f"  ‚ö†Ô∏è  {missing_embeddings} sequences missing embeddings")
        
    elif embedding_format == "numpy":
        # Row i of the array is paired with the i-th filtered sequence.
        # NOTE(review): this relies on the array having been written in the
        # same order as the filtered FASTA -- only the row count is verified.
        emb_array = embeddings_data
        print(f"   ‚Ä¢ using order-based alignment from numpy array")
        
        if len(seqs_filt) != emb_array.shape[0]:
            print(f"  ‚ùå  Count mismatch: {len(seqs_filt)} seqs vs {emb_array.shape[0]} embeddings")
            continue
        
        # Build dictionary using order
        embedding_dict = {}
        for idx, rec in enumerate(seqs_filt):
            embedding_dict[rec.id] = emb_array[idx]

    # 4) load raw sequences from the unfiltered FASTA with the same base name
    raw_fp = raw_dir / f"{base}_AnnotatedProteins.fasta"
    if not raw_fp.exists():
        print(f"  ‚ö†Ô∏è  raw FASTA not found: {raw_fp.name}, skipping")
        continue
    seqs_raw = list(SeqIO.parse(str(raw_fp), "fasta"))

    print(f"   ‚Ä¢ raw seqs: {len(seqs_raw)}    embedding_dict: {len(embedding_dict)}")

    # 5) build gene_dict_complete from the raw seqs WITH embeddings where available
    gene_dict_complete = {}
    for rec in seqs_raw:
        # Lookup by record ID; .get() yields None when no embedding exists.
        emb_vec = embedding_dict.get(rec.id)  # None if not found
        
        gene_dict_complete[rec.id] = {
            "embedding": emb_vec,  # Will be None for sequences without embeddings
            # One fresh, empty list per GO key; filled from the GAF in step 6.
            **{k: [] for k in [
                "GO Component","GO Function","GO Process",
                "GO IEA Component","GO IEA Function","GO IEA Process"
            ]}
        }

    # 6) merge GAF annotations into gene_dict_complete
    # The GAF is read from gaf_out_dir, i.e. the copy made during setup.
    gaf_fp = gaf_out_dir / f"{base}_GO.gaf.gz"
    if not gaf_fp.exists():
        # Skipping here discards the dictionary built in step 5 for this genome.
        print(f"  ‚ö†Ô∏è  GAF missing for {base}, skipping")
        continue

    # Map each gene ID (first token after "gene=" in the FASTA description)
    # to the record IDs of all proteins that carry it.
    gene_to_protein_ids = {}
    proteins_with_gene_ids = 0
    
    for rec in seqs_raw:
        desc = rec.description
        if "gene=" in desc:
            try:
                # Extract gene ID from FASTA description
                gene_id = desc.split("gene=")[1].split()[0]
                if gene_id not in gene_to_protein_ids:
                    gene_to_protein_ids[gene_id] = []
                gene_to_protein_ids[gene_id].append(rec.id)
                proteins_with_gene_ids += 1
            except (IndexError, AttributeError):
                # Skip if gene ID extraction fails
                continue
    
    print(f"   ‚Ä¢ proteins with gene IDs: {proteins_with_gene_ids}")
    print(f"   ‚Ä¢ unique gene IDs: {len(gene_to_protein_ids)}")

    # Walk the GAF line by line and attach each GO term to every protein of
    # the gene it names.
    gaf_annotations_added = 0
    gaf_lines_processed = 0
    
    with gzip.open(gaf_fp, "rt") as gf:
        for line in gf:
            # Lines starting with "!" are skipped (GAF header/comment lines).
            if line.startswith("!"): 
                continue
            
            try:
                cols = line.rstrip().split("\t")
                if len(cols) < 9:  # Ensure we have enough columns
                    continue
                    
                # 0-based columns 1, 4, 8 and 6 are read as gene ID, GO term,
                # aspect letter (lower-cased) and evidence code.
                gid, term, asp, ev = cols[1], cols[4], cols[8].lower(), cols[6]
                # IEA evidence goes to the "GO IEA <aspect>" list, everything
                # else to "GO <aspect>".
                key = ("GO IEA " if ev == "IEA" else "GO ") + aspect_map[asp]
                
                # Exact gene-ID match via the mapping built above.
                if gid in gene_to_protein_ids:
                    for protein_id in gene_to_protein_ids[gid]:
                        if protein_id in gene_dict_complete:
                            # Terms are appended as-is; repeated GAF lines
                            # therefore produce repeated list entries.
                            gene_dict_complete[protein_id][key].append(term)
                            gaf_annotations_added += 1
                
                # Counts every well-formed line, matched to a gene or not.
                gaf_lines_processed += 1
                
            except (IndexError, KeyError, ValueError):
                # Skip malformed lines.  A KeyError here means an aspect
                # letter that is not in aspect_map; such lines are dropped
                # silently and not counted as processed.
                continue
    
    print(f"   ‚Ä¢ GAF lines processed: {gaf_lines_processed}")
    print(f"   ‚Ä¢ annotations added: {gaf_annotations_added}")

    # 7) build gene_dict_filtered: subset of complete (embeddings already attached)
    # NOTE(review): passes_filters is re-applied to records read from the
    # already-filtered FASTA; presumably redundant -- confirm.
    gene_dict_filtered = {}
    for rec in seqs_filt:
        if passes_filters(rec):
            if rec.id in gene_dict_complete:
                # dict.copy() is shallow: the GO lists and the embedding are
                # the same objects as in gene_dict_complete.
                gene_dict_filtered[rec.id] = gene_dict_complete[rec.id].copy()

    # 8) save both dictionaries
    full_out = dict_out_dir / f"{base}_gene_dict_complete.pkl"
    filt_out = dict_out_dir / f"{base}_gene_dict_filtered.pkl"
    
    with open(full_out, "wb") as f:
        pickle.dump(gene_dict_complete, f)
    with open(filt_out, "wb") as f:
        pickle.dump(gene_dict_filtered, f)

    # Summary statistics: entries that have an embedding ...
    complete_with_emb = sum(1 for entry in gene_dict_complete.values() if entry["embedding"] is not None)
    filtered_with_emb = sum(1 for entry in gene_dict_filtered.values() if entry["embedding"] is not None)
    
    # ... and entries with at least one GO term in any of the six lists.
    complete_with_go = sum(1 for entry in gene_dict_complete.values() 
                          if any(len(entry[k]) > 0 for k in ["GO Component", "GO Function", "GO Process",
                                                            "GO IEA Component", "GO IEA Function", "GO IEA Process"]))
    filtered_with_go = sum(1 for entry in gene_dict_filtered.values() 
                          if any(len(entry[k]) > 0 for k in ["GO Component", "GO Function", "GO Process",
                                                            "GO IEA Component", "GO IEA Function", "GO IEA Process"]))
    
    print(f"‚úÖ {base}: complete={len(gene_dict_complete)} ({complete_with_emb} with embeddings, {complete_with_go} with GO)")
    print(f"   filtered={len(gene_dict_filtered)} ({filtered_with_emb} with embeddings, {filtered_with_go} with GO)")
    print(f"   ‚Ä¢ embedding format used: {embedding_format}")

print("\nüéâ Gene dictionary generation completed!")

In [None]:
import pickle
import pandas as pd
from pathlib import Path

def analyze_annotation_completeness(dict_dir):
    """
    Analyze GO annotation completeness across all gene dictionaries.

    Every ``*_gene_dict_complete.pkl`` file in ``dict_dir`` is loaded and each
    protein is classified by how many of the three subontologies (MF, BP, CC)
    carry at least one term, counting regular and IEA annotations together:

    - Unannotated: no GO term in any subontology
    - Partially annotated: terms in one or two subontologies
    - Fully annotated: terms in all three subontologies

    Empty strings and ``None`` are not counted as terms.

    Parameters
    ----------
    dict_dir : pathlib.Path
        Folder containing the pickled gene dictionaries.

    Returns
    -------
    pandas.DataFrame
        One row per file with the species name, protein total, the three
        counts and the three percentages (rounded to 2 decimals).  The columns
        are always present, even when no file is found; a species with zero
        proteins gets 0.0 for every percentage.
    """
    go_key_pairs = [
        ("GO Function", "GO IEA Function"),      # MF
        ("GO Process", "GO IEA Process"),        # BP
        ("GO Component", "GO IEA Component"),    # CC
    ]
    columns = [
        'Species', 'Total_Proteins',
        'Unannotated_Count', 'Partially_Annotated_Count', 'Fully_Annotated_Count',
        'Unannotated_Percent', 'Partially_Annotated_Percent', 'Fully_Annotated_Percent',
    ]

    def _term_set(value):
        """Normalise a stored GO value (str, iterable or anything else) to a set of terms."""
        if isinstance(value, str):
            terms = {value}
        elif hasattr(value, '__iter__'):
            terms = set(value)
        else:
            terms = set()
        # Placeholders are not annotations.
        terms.discard('')
        terms.discard(None)
        return terms

    results = []

    for dict_file in sorted(dict_dir.glob("*_gene_dict_complete.pkl")):
        species = dict_file.stem.replace("_gene_dict_complete", "")

        # SECURITY: unpickling can execute arbitrary code; only point this at
        # dictionaries you produced yourself.
        with open(dict_file, "rb") as f:
            gene_dict = pickle.load(f)

        unannotated = 0
        partially_annotated = 0
        fully_annotated = 0

        for protein_data in gene_dict.values():
            # Number of subontologies with at least one (regular or IEA) term.
            annotated_subontologies = sum(
                1
                for go_key, go_iea_key in go_key_pairs
                if _term_set(protein_data.get(go_key, []))
                | _term_set(protein_data.get(go_iea_key, []))
            )

            if annotated_subontologies == 0:
                unannotated += 1
            elif annotated_subontologies == len(go_key_pairs):
                fully_annotated += 1
            else:  # 1 or 2 subontologies
                partially_annotated += 1

        total_proteins = len(gene_dict)
        if total_proteins:
            unannotated_pct = (unannotated / total_proteins) * 100
            partially_pct = (partially_annotated / total_proteins) * 100
            fully_pct = (fully_annotated / total_proteins) * 100
        else:
            # An empty dictionary would otherwise raise ZeroDivisionError.
            unannotated_pct = partially_pct = fully_pct = 0.0

        results.append({
            'Species': species,
            'Total_Proteins': total_proteins,
            'Unannotated_Count': unannotated,
            'Partially_Annotated_Count': partially_annotated,
            'Fully_Annotated_Count': fully_annotated,
            'Unannotated_Percent': round(unannotated_pct, 2),
            'Partially_Annotated_Percent': round(partially_pct, 2),
            'Fully_Annotated_Percent': round(fully_pct, 2)
        })

        print(f"‚úÖ {species}: {total_proteins} proteins")
        print(f"   Unannotated: {unannotated} ({unannotated_pct:.1f}%)")
        print(f"   Partial: {partially_annotated} ({partially_pct:.1f}%)")
        print(f"   Full: {fully_annotated} ({fully_pct:.1f}%)")

    # Explicit columns keep the frame well-formed when no file was found.
    df = pd.DataFrame(results, columns=columns)

    if df.empty:
        # There is nothing to average; say so instead of failing on a missing column.
        print(f"No *_gene_dict_complete.pkl files found in {dict_dir}")
        return df

    print(f"\nüìä SUMMARY ACROSS ALL SPECIES:")
    print(f"Total species analyzed: {len(df)}")
    print(f"Average unannotated: {df['Unannotated_Percent'].mean():.1f}%")
    print(f"Average partial: {df['Partially_Annotated_Percent'].mean():.1f}%")
    print(f"Average full: {df['Fully_Annotated_Percent'].mean():.1f}%")

    return df

def detailed_subontology_analysis(dict_dir):
    """
    Per-subontology breakdown of GO annotation coverage.

    Every ``*_gene_dict_complete.pkl`` file in ``dict_dir`` is loaded and, for
    each of MF, BP and CC, the proteins with at least one term (regular or
    IEA) are counted.  Empty strings and ``None`` are not counted as terms.

    Parameters
    ----------
    dict_dir : pathlib.Path
        Folder containing the pickled gene dictionaries.

    Returns
    -------
    pandas.DataFrame
        One row per file with the species name, protein total, and the count
        and percentage (rounded to 2 decimals) of annotated proteins for MF,
        BP and CC.  The columns are always present, even when no file is
        found; a species with zero proteins gets 0.0 for every percentage.
    """
    go_keys = {
        'MF': ("GO Function", "GO IEA Function"),
        'BP': ("GO Process", "GO IEA Process"),
        'CC': ("GO Component", "GO IEA Component")
    }
    columns = [
        'Species', 'Total_Proteins',
        'MF_Annotated', 'BP_Annotated', 'CC_Annotated',
        'MF_Percent', 'BP_Percent', 'CC_Percent',
    ]

    def _term_set(value):
        """Normalise a stored GO value to a set of terms.

        A bare string is one term (not a bag of characters) and a
        non-iterable value such as None yields an empty set instead of a
        TypeError.
        """
        if isinstance(value, str):
            terms = {value}
        elif hasattr(value, '__iter__'):
            terms = set(value)
        else:
            terms = set()
        # Remove empty strings/None
        terms.discard('')
        terms.discard(None)
        return terms

    def _percent(count, total):
        """Share of ``count`` in ``total`` as a rounded percentage; 0.0 when total is 0."""
        return round((count / total) * 100, 2) if total else 0.0

    results = []

    for dict_file in sorted(dict_dir.glob("*_gene_dict_complete.pkl")):
        species = dict_file.stem.replace("_gene_dict_complete", "")

        # SECURITY: unpickling can execute arbitrary code; only point this at
        # dictionaries you produced yourself.
        with open(dict_file, "rb") as f:
            gene_dict = pickle.load(f)

        # Proteins with at least one term, per subontology.
        subontology_counts = {onto: 0 for onto in go_keys}

        for protein_data in gene_dict.values():
            for onto, (go_key, go_iea_key) in go_keys.items():
                combined = (_term_set(protein_data.get(go_key, []))
                            | _term_set(protein_data.get(go_iea_key, [])))
                if combined:
                    subontology_counts[onto] += 1

        total = len(gene_dict)
        results.append({
            'Species': species,
            'Total_Proteins': total,
            'MF_Annotated': subontology_counts['MF'],
            'BP_Annotated': subontology_counts['BP'],
            'CC_Annotated': subontology_counts['CC'],
            'MF_Percent': _percent(subontology_counts['MF'], total),
            'BP_Percent': _percent(subontology_counts['BP'], total),
            'CC_Percent': _percent(subontology_counts['CC'], total)
        })

    # Explicit columns keep the frame well-formed when no file was found.
    return pd.DataFrame(results, columns=columns)

# Driver: run both analyses over the pickled gene dictionaries, export each
# result as CSV, and print a compact summary table.
if __name__ == "__main__":
    dict_dir = Path("genomes_to_annotate_with_PlasmoFP/gene_dicts_out_complete_and_filtered_2")
    rule = "="*50

    # --- Baseline completeness (unannotated / partial / full) ---
    print("üî¨ BASELINE GO ANNOTATION ANALYSIS")
    print(rule)
    baseline_df = analyze_annotation_completeness(dict_dir)
    baseline_df.to_csv("baseline_annotation_completeness.csv", index=False)
    print(f"\nüíæ Saved baseline analysis to: baseline_annotation_completeness.csv")

    # --- Per-subontology coverage (MF / BP / CC) ---
    print(f"\nüîç DETAILED SUBONTOLOGY ANALYSIS")
    print(rule)
    detailed_df = detailed_subontology_analysis(dict_dir)
    detailed_df.to_csv("detailed_subontology_analysis.csv", index=False)
    print(f"\nüíæ Saved detailed analysis to: detailed_subontology_analysis.csv")

    # --- Percentages only, one line per species ---
    summary_columns = [
        'Species',
        'Total_Proteins',
        'Unannotated_Percent',
        'Partially_Annotated_Percent',
        'Fully_Annotated_Percent',
    ]
    print(f"\nüìã BASELINE COMPLETENESS SUMMARY:")
    print(baseline_df[summary_columns].to_string(index=False))