The goal of this notebook is to build dictionaries for the training and SwissProt datasets for use in evaluating the BLAST and Foldseek baselines for use with @build_blast_baseline.py and @build_foldseek_basline.py. 

In [25]:
import pandas as pd 
import re 
import requests
from tqdm import tqdm
import pickle

In [2]:
swissprot = pd.read_csv('blast_foldseek_results/swissprot_8_30_25_go_term_mapping.tsv', sep='\t')
function_train = pd.read_csv('processed_data_90_30/function_train.tsv', sep='\t')
process_train = pd.read_csv('processed_data_90_30/process_train.tsv', sep='\t')
component_train = pd.read_csv('processed_data_90_30/component_train.tsv', sep='\t')

print((len(swissprot), len(function_train), len(process_train), len(component_train)))

(573661, 289930, 227756, 247286)


In [3]:
swissprot.head()

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology (molecular function),Gene Ontology IDs,Gene Ontology (GO),Gene Ontology (biological process),Gene Ontology (cellular component)
0,A0A009IHW8,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,J512_3302,Acinetobacter baumannii (strain 1295743),269,NAD+ nucleosidase activity [GO:0003953]; NAD+ ...,GO:0003953; GO:0007165; GO:0019677; GO:0050135...,NAD+ nucleosidase activity [GO:0003953]; NAD+ ...,NAD catabolic process [GO:0019677]; signal tra...,
1,A0A023I7E1,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",ENG1 LAM81A,Rhizomucor miehei,796,"endo-1,3(4)-beta-glucanase activity [GO:005286...",GO:0000272; GO:0005576; GO:0042973; GO:0052861...,"extracellular region [GO:0005576]; endo-1,3(4)...",cell wall organization [GO:0071555]; polysacch...,extracellular region [GO:0005576]
2,A0A024B7W1,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423,"4 iron, 4 sulfur cluster binding [GO:0051539];...",GO:0003724; GO:0003725; GO:0003968; GO:0004252...,centrosome [GO:0005813]; extracellular region ...,clathrin-dependent endocytosis of virus by hos...,centrosome [GO:0005813]; extracellular region ...
3,A0A024RXP8,GUX1_HYPJR,"Exoglucanase 1 (EC 3.2.1.91) (1,4-beta-cellobi...",cbh1 M419DRAFT_125125,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,514,"cellulose 1,4-beta-cellobiosidase activity [GO...",GO:0005576; GO:0016162; GO:0030245; GO:0030248,extracellular region [GO:0005576]; cellulose 1...,cellulose catabolic process [GO:0030245],extracellular region [GO:0005576]
4,A0A024SC78,CUTI1_HYPJR,Cutinase (EC 3.1.1.74),M419DRAFT_76732,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,248,cutinase activity [GO:0050525],GO:0005576; GO:0016052; GO:0050525,extracellular region [GO:0005576]; cutinase ac...,carbohydrate catabolic process [GO:0016052],extracellular region [GO:0005576]


In [4]:
function_train.head()

Unnamed: 0,Cluster_2,Entry,Entry Name,Organism,Sequence,Gene Ontology (molecular function),Gene Ontology (biological process),Gene Ontology (cellular component),Pfam,InterPro,GOAssertion,Cluster,Raw GO terms,Raw propagated GO terms
0,A0A023B1D3,A0A023B1D3,A0A023B1D3_GRENI,Gregarina niphandrodes (Septate eugregarine),MDLGRVEELGKAVSLHYTIPTIHVTGTNGKGSVVSLISSMLMATGL...,ATP binding [GO:0005524]; dihydrofolate syntha...,,cytosol [GO:0005829]; mitochondrion [GO:0005739],,IPR001645;IPR018109;IPR036565;,AA,A0A023B1D3,"['GO:0005524', 'GO:0008841', 'GO:0004326']","['GO:0016879', 'GO:0003824', 'GO:0032559', 'GO..."
1,A0A023B4B6,A0A023B4B6,A0A023B4B6_GRENI,Gregarina niphandrodes (Septate eugregarine),MSLAASANTADLRYPESTSHLRKWIASSRTELQRIKEVFFRRSLVT...,structural constituent of ribosome [GO:0003735],translation [GO:0006412],cytosolic small ribosomal subunit [GO:0022627],PF00164;,IPR036915;IPR012340;IPR006032;IPR005680;,AA,A0A023B4B6,['GO:0003735'],"['GO:0003674', 'GO:0005198']"
2,A0A023B4B6,A0A060D6I3,A0A060D6I3_9EUKA,Lotharella oceanica,MGKPSGINSGNRILKNLTKHRSADPGYIKRIYHQIFYRPFGGAPHA...,structural constituent of ribosome [GO:0003735],translation [GO:0006412],cytosolic small ribosomal subunit [GO:0022627]...,PF00164;,IPR012340;IPR006032;IPR005680;,AA,A0A7S3Z0E3,['GO:0003735'],"['GO:0003674', 'GO:0005198']"
3,A0A023B4B6,A0A0H5BHB4,A0A0H5BHB4_9EUKA,Lotharella vacuolata,MGKPSGINAGNRILKNLTKHRRADKSYKKRIYHQIFYRPFGGAPHA...,structural constituent of ribosome [GO:0003735],translation [GO:0006412],cytosolic small ribosomal subunit [GO:0022627]...,PF00164;,IPR012340;IPR006032;,AA,A0A0H5BHB4,['GO:0003735'],"['GO:0003674', 'GO:0005198']"
4,A0A023B4T9,A0A023B4T9,A0A023B4T9_GRENI,Gregarina niphandrodes (Septate eugregarine),MTLHLPDPEGNMSKPKQAVTEATVQKPVAREGHCSYYVKSRRYFCR...,metal ion binding [GO:0046872]; tRNA 2'-O-meth...,tRNA methylation [GO:0030488],,PF05206;PF11722;PF05253;,IPR007871;IPR039044;IPR022776;IPR021721;,AA,A0A023B4T9,"['GO:0046872', 'GO:0106050']","['GO:0016740', 'GO:0008175', 'GO:0008168', 'GO..."


First split SwissProt by ontology 

In [7]:
swissprot_MF.iloc[0]['Gene Ontology (molecular function)']

'NAD+ nucleosidase activity [GO:0003953]; NAD+ nucleosidase activity, cyclic ADP-ribose generating [GO:0061809]; NADP+ nucleosidase activity [GO:0050135]'

In [None]:
swissprot_MF = swissprot[swissprot['Gene Ontology (molecular function)'].notna()]
swissprot_BP = swissprot[swissprot['Gene Ontology (biological process)'].notna()]
swissprot_CC = swissprot[swissprot['Gene Ontology (cellular component)'].notna()]

swissprot_MF["GO_IDs_extracted"] = swissprot_MF["Gene Ontology (molecular function)"].str.findall(r'GO:\d+')
swissprot_BP["GO_IDs_extracted"] = swissprot_BP["Gene Ontology (biological process)"].str.findall(r'GO:\d+')
swissprot_CC["GO_IDs_extracted"] = swissprot_CC["Gene Ontology (cellular component)"].str.findall(r'GO:\d+')

print(len(swissprot_MF), len(swissprot_BP), len(swissprot_CC))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swissprot_MF["GO_IDs_extracted"] = swissprot_MF["Gene Ontology (molecular function)"].str.findall(r'GO:\d+')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swissprot_BP["GO_IDs_extracted"] = swissprot_BP["Gene Ontology (biological process)"].str.findall(r'GO:\d+')


492124 465666 467660


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swissprot_CC["GO_IDs_extracted"] = swissprot_CC["Gene Ontology (cellular component)"].str.findall(r'GO:\d+')


In [10]:
swissprot_MF_go_ids_unique = swissprot_MF['GO_IDs_extracted'].explode().unique()
swissprot_BP_go_ids_unique = swissprot_BP['GO_IDs_extracted'].explode().unique()
swissprot_CC_go_ids_unique = swissprot_CC['GO_IDs_extracted'].explode().unique()

print(len(swissprot_MF_go_ids_unique), len(swissprot_BP_go_ids_unique), len(swissprot_CC_go_ids_unique))


8110 17516 2879


In [None]:
base_url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms"

def get_ancestors(go_terms_batch):
    terms = "%2C".join(go_terms_batch)  
    request_url = f"{base_url}/{terms}/ancestors?relations=is_a"
    
    try:
        response = requests.get(request_url, headers={"Accept": "application/json"})
        response.raise_for_status()  

        data = response.json()
        
        ancestors_dict_batch = {}
        json_dict_batch = {}
        for result in data.get("results", []):
            term = result.get("id")
            ancestors = result.get("ancestors", [])
            ancestors_dict_batch[term] = ancestors if ancestors else None
            json_dict_batch[term] = result  

        return json_dict_batch, ancestors_dict_batch

    except requests.exceptions.RequestException as e:
        print(f"Error fetching ancestors for {go_terms_batch}: {e}")
        return None, None

def process_go_terms_in_batches(go_terms, batch_size=20):
    function_ancestors_dict = {}      
    function_json_dict = {}           

    for i in tqdm(range(0, len(go_terms), batch_size)):
        batch = go_terms[i:i + batch_size]  
        json_data_batch, ancestors_dict_batch = get_ancestors(batch)

        if json_data_batch and ancestors_dict_batch:
            function_json_dict.update(json_data_batch)  
            function_ancestors_dict.update(ancestors_dict_batch)  

    return function_json_dict, function_ancestors_dict

batch_size = 50  

swissprot_MF_function_json_dict, swissprot_MF_function_ancestors_dict = process_go_terms_in_batches(swissprot_MF_go_ids_unique, batch_size=batch_size)
swissprot_BP_function_json_dict, swissprot_BP_function_ancestors_dict = process_go_terms_in_batches(swissprot_BP_go_ids_unique, batch_size=batch_size)
swissprot_CC_function_json_dict, swissprot_CC_function_ancestors_dict = process_go_terms_in_batches(swissprot_CC_go_ids_unique, batch_size=batch_size)


100%|██████████| 163/163 [01:25<00:00,  1.91it/s]
100%|██████████| 351/351 [03:10<00:00,  1.84it/s]
100%|██████████| 58/58 [00:29<00:00,  1.97it/s]


In [None]:
def collect_ancestors(go_ids, ancestors_dict):
    out = []
    if not isinstance(go_ids, (list, tuple)):
        return out
    for go_id in go_ids:
        anc = ancestors_dict.get(go_id) or []
        out.extend(anc)
    return list(dict.fromkeys(out))

swissprot_MF_go_ids_with_ancestors = swissprot_MF['GO_IDs_extracted'].apply(lambda x: collect_ancestors(x, swissprot_MF_function_ancestors_dict))
swissprot_BP_go_ids_with_ancestors = swissprot_BP['GO_IDs_extracted'].apply(lambda x: collect_ancestors(x, swissprot_BP_function_ancestors_dict))
swissprot_CC_go_ids_with_ancestors = swissprot_CC['GO_IDs_extracted'].apply(lambda x: collect_ancestors(x, swissprot_CC_function_ancestors_dict))

print(len(swissprot_MF_go_ids_with_ancestors), len(swissprot_BP_go_ids_with_ancestors), len(swissprot_CC_go_ids_with_ancestors))

492124 465666 467660


In [None]:
with open('processed_data_90_30/process_mlb.pkl', 'rb') as f:
    process_mlb = pickle.load(f)

with open('processed_data_90_30/function_mlb.pkl', 'rb') as f:
    function_mlb = pickle.load(f)

with open('processed_data_90_30/component_mlb.pkl', 'rb') as f:
    component_mlb = pickle.load(f)

print(len(process_mlb.classes_), len(function_mlb.classes_), len(component_mlb.classes_))

1548 744 255




In [44]:
def keep_allowed(lst, allowed):
    if not isinstance(lst, (list, tuple, set)):
        return []
    # dedupe but preserve order
    out = [x for x in lst if x in allowed]
    return list(dict.fromkeys(out))

allowed_func = set(map(str, function_mlb.classes_))
allowed_proc = set(map(str, process_mlb.classes_))
allowed_comp = set(map(str, component_mlb.classes_))

swissprot_ids_MF = swissprot_MF['Entry'].tolist()
swissprot_ids_BP = swissprot_BP['Entry'].tolist()
swissprot_ids_CC = swissprot_CC['Entry'].tolist()

swissprot_MF_go_ids_with_ancestors_filtered = swissprot_MF_go_ids_with_ancestors.apply(lambda xs: keep_allowed(xs, allowed_func))
swissprot_BP_go_ids_with_ancestors_filtered = swissprot_BP_go_ids_with_ancestors.apply(lambda xs: keep_allowed(xs, allowed_proc))
swissprot_CC_go_ids_with_ancestors_filtered = swissprot_CC_go_ids_with_ancestors.apply(lambda xs: keep_allowed(xs, allowed_comp))

assert len(swissprot_ids_MF) == len(swissprot_MF_go_ids_with_ancestors_filtered)
assert len(swissprot_ids_BP) == len(swissprot_BP_go_ids_with_ancestors_filtered)
assert len(swissprot_ids_CC) == len(swissprot_CC_go_ids_with_ancestors_filtered)

swissprot_MF_progated_filtered_mlb = dict(zip(swissprot_ids_MF, swissprot_MF_go_ids_with_ancestors_filtered))
swissprot_BP_progated_filtered_mlb = dict(zip(swissprot_ids_BP, swissprot_BP_go_ids_with_ancestors_filtered))
swissprot_CC_progated_filtered_mlb = dict(zip(swissprot_ids_CC, swissprot_CC_go_ids_with_ancestors_filtered))


assert len(swissprot_ids_MF) == len(swissprot_MF_go_ids_with_ancestors)
assert len(swissprot_ids_BP) == len(swissprot_BP_go_ids_with_ancestors)
assert len(swissprot_ids_CC) == len(swissprot_CC_go_ids_with_ancestors)

swissprot_MF_propogated_unfiltered = dict(zip(swissprot_ids_MF, swissprot_MF_go_ids_with_ancestors))
swissprot_BP_propogated_unfiltered = dict(zip(swissprot_ids_BP, swissprot_BP_go_ids_with_ancestors))
swissprot_CC_propogated_unfiltered = dict(zip(swissprot_ids_CC, swissprot_CC_go_ids_with_ancestors))

swissprot_MF_progated_filtered_mlb = {k: v for k, v in swissprot_MF_progated_filtered_mlb.items() if v}
swissprot_BP_progated_filtered_mlb = {k: v for k, v in swissprot_BP_progated_filtered_mlb.items() if v}
swissprot_CC_progated_filtered_mlb = {k: v for k, v in swissprot_CC_progated_filtered_mlb.items() if v}

swissprot_MF_propogated_unfiltered = {k: v for k, v in swissprot_MF_propogated_unfiltered.items() if v}
swissprot_BP_propogated_unfiltered = {k: v for k, v in swissprot_BP_propogated_unfiltered.items() if v}
swissprot_CC_propogated_unfiltered = {k: v for k, v in swissprot_CC_propogated_unfiltered.items() if v}

print(len(swissprot_MF_progated_filtered_mlb), len(swissprot_BP_progated_filtered_mlb), len(swissprot_CC_progated_filtered_mlb))
print(len(swissprot_MF_propogated_unfiltered), len(swissprot_BP_propogated_unfiltered), len(swissprot_CC_propogated_unfiltered))


491506 464347 467656
491506 464347 467656


In [45]:
with open('blast_foldseek_results/swissprot_MF_progated_filtered_mlb.pkl', 'wb') as f:
    pickle.dump(swissprot_MF_progated_filtered_mlb, f)

with open('blast_foldseek_results/swissprot_BP_progated_filtered_mlb.pkl', 'wb') as f:
    pickle.dump(swissprot_BP_progated_filtered_mlb, f)

with open('blast_foldseek_results/swissprot_CC_progated_filtered_mlb.pkl', 'wb') as f:
    pickle.dump(swissprot_CC_progated_filtered_mlb, f)

with open('blast_foldseek_results/swissprot_MF_propogated_unfiltered.pkl', 'wb') as f:
    pickle.dump(swissprot_MF_propogated_unfiltered, f)

with open('blast_foldseek_results/swissprot_BP_propogated_unfiltered.pkl', 'wb') as f:
    pickle.dump(swissprot_BP_propogated_unfiltered, f)

with open('blast_foldseek_results/swissprot_CC_propogated_unfiltered.pkl', 'wb') as f:
    pickle.dump(swissprot_CC_propogated_unfiltered, f)
    

In [47]:
function_train_ids = function_train['Entry'].tolist()
process_train_ids = process_train['Entry'].tolist()
component_train_ids = component_train['Entry'].tolist()

function_train_go_ids = function_train['Raw propagated GO terms'].tolist()
process_train_go_ids = process_train['Raw propagated GO terms'].tolist()
component_train_go_ids = component_train['Raw propagated GO terms'].tolist()

# Build mapping dicts
function_train_dict = dict(zip(function_train_ids, function_train_go_ids))
process_train_dict = dict(zip(process_train_ids, process_train_go_ids))
component_train_dict = dict(zip(component_train_ids, component_train_go_ids))


with open('blast_foldseek_results/function_train_dict.pkl', 'wb') as f:
    pickle.dump(function_train_dict, f)

with open('blast_foldseek_results/process_train_dict.pkl', 'wb') as f:
    pickle.dump(process_train_dict, f)

with open('blast_foldseek_results/component_train_dict.pkl', 'wb') as f:
    pickle.dump(component_train_dict, f)


In [None]:
def save_dict_as_tsv(d, out_file):
    with open(out_file, "w") as f:
        for seq_id, go_terms in d.items():
            f.write(f"{seq_id}\t{go_terms}\n")  

save_dict_as_tsv(swissprot_MF_progated_filtered_mlb, "blast_foldseek_results/swissprot_MF_filtered.tsv")
save_dict_as_tsv(swissprot_BP_progated_filtered_mlb, "blast_foldseek_results/swissprot_BP_filtered.tsv")
save_dict_as_tsv(swissprot_CC_progated_filtered_mlb, "blast_foldseek_results/swissprot_CC_filtered.tsv")

save_dict_as_tsv(swissprot_MF_propogated_unfiltered, "blast_foldseek_results/swissprot_MF_unfiltered.tsv")
save_dict_as_tsv(swissprot_BP_propogated_unfiltered, "blast_foldseek_results/swissprot_BP_unfiltered.tsv")
save_dict_as_tsv(swissprot_CC_propogated_unfiltered, "blast_foldseek_results/swissprot_CC_unfiltered.tsv")

save_dict_as_tsv(function_train_dict, "blast_foldseek_results/function_train_dict.tsv")
save_dict_as_tsv(process_train_dict, "blast_foldseek_results/process_train_dict.tsv")
save_dict_as_tsv(component_train_dict, "blast_foldseek_results/component_train_dict.tsv")


#Ground truths 

In [50]:
function_test = pd.read_csv('processed_data_90_30/function_test.tsv', sep='\t')
process_test = pd.read_csv('processed_data_90_30/process_test.tsv', sep='\t')
component_test = pd.read_csv('processed_data_90_30/component_test.tsv', sep='\t')

function_test_ids = function_test['Entry'].tolist()
process_test_ids = process_test['Entry'].tolist()
component_test_ids = component_test['Entry'].tolist()

function_test_go_ids = function_test['Raw propagated GO terms'].tolist()
process_test_go_ids = process_test['Raw propagated GO terms'].tolist()
component_test_go_ids = component_test['Raw propagated GO terms'].tolist()

function_test_dict = dict(zip(function_test_ids, function_test_go_ids))
process_test_dict = dict(zip(process_test_ids, process_test_go_ids))
component_test_dict = dict(zip(component_test_ids, component_test_go_ids))

with open('blast_foldseek_results/function_test_dict.pkl', 'wb') as f:
    pickle.dump(function_test_dict, f)

with open('blast_foldseek_results/process_test_dict.pkl', 'wb') as f:
    pickle.dump(process_test_dict, f)

with open('blast_foldseek_results/component_test_dict.pkl', 'wb') as f:
    pickle.dump(component_test_dict, f)