In [1]:
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib
import pickle
from tqdm import tqdm
from Bio import SeqIO
import gc

In [2]:
import sys
sys.path.append('../')
from src.ensemble_utils import ProteinPredictions

In [3]:
def extract_go_terms_and_branches(file_path):
    """Map every GO term defined in an OBO file to its ontology branch code.

    Parameters
    ----------
    file_path : str or path-like
        Path to a Gene Ontology file in OBO format (e.g. ``go-basic.obo``).

    Returns
    -------
    dict
        ``{GO id: branch}`` where branch is ``'BPO'``, ``'CCO'`` or ``'MFO'``.
        ``[Term]`` stanzas that lack an ``id: GO:...`` line, lack a
        ``namespace:`` line, or use a namespace other than the three GO
        branches are skipped instead of raising ``KeyError``.
    """
    # Namespace -> branch abbreviation; built once instead of once per stanza.
    branch_abbr = {
        'biological_process': 'BPO',
        'cellular_component': 'CCO',
        'molecular_function': 'MFO',
    }

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A [Term] stanza extends (lazily) up to the next stanza header ("\n[")
    # or the end of the file, so [Typedef] stanzas are never swallowed.
    stanzas = re.findall(r'\[Term\][\s\S]*?(?=\n\[|$)', content)

    go_terms_dict = {}
    for stanza in stanzas:
        # "^id:" is anchored at line start, so "alt_id:" lines do not match.
        go_id = re.search(r'^id: (GO:\d+)', stanza, re.MULTILINE)
        namespace = re.search(r'^namespace: (\w+)', stanza, re.MULTILINE)
        if not (go_id and namespace):
            continue

        branch = branch_abbr.get(namespace.group(1))
        if branch is not None:
            go_terms_dict[go_id.group(1)] = branch

    return go_terms_dict

# Build the GO id -> branch ('BPO'/'CCO'/'MFO') lookup once. The loading cells
# use it to drop GO terms absent from this ontology file and to pass each
# term's branch to add_prediction.
file_path = '../input/cafa-5-protein-function-prediction/Train/go-basic.obo'
go_terms_dict = extract_go_terms_and_branches(file_path)

## Ensembling Tomi's best ESM and T5 models

#### LB: 0.5117

In [11]:
# New ProteinPredictions instance that collects the two Tomi model outputs added below.
protein_predictions = ProteinPredictions()

In [12]:
# Add Tomi's T5 predictions. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/tomi_v2_t5_0.484.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

9692291it [00:14, 656277.85it/s]


In [13]:
# Add Tomi's ESM2 predictions. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/tomi_v9_esm2_0.495.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

9750489it [00:17, 561676.74it/s]


In [14]:
# Produce the Tomi T5+ESM2 ensemble; output_file is read back by a later cell
# as one input of the Joni+Tomi ensemble. The combination logic itself lives in
# ProteinPredictions.get_predictions (src.ensemble_utils).
protein_predictions.get_predictions(output_file='../output/ensemble/tomi_v2_and_v9.tsv')

## Ensembling Joni's best ESM and T5 models

#### LB: 0.51415

In [15]:
# Rebind to a new ProteinPredictions instance for the Joni T5+ESM2 ensemble.
protein_predictions = ProteinPredictions()

In [16]:
# Add Joni's T5 predictions. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/joni_v1_t5_0.478.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

212797500it [05:30, 644235.61it/s]


In [17]:
# Add Joni's ESM2 predictions. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/joni_v8_esm2_f1-0.34_0.509.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

212797500it [06:39, 532224.98it/s]


In [18]:
# Produce the Joni T5+ESM2 ensemble; output_file is read back by a later cell
# as one input of the Joni+Tomi ensemble. The combination logic itself lives in
# ProteinPredictions.get_predictions (src.ensemble_utils).
protein_predictions.get_predictions(output_file='../output/ensemble/joni_v1_and_v8.tsv')

## Ensembling Joni's and Tomi's ensembles

#### LB: 0.51862

In [19]:
# Rebind to a new ProteinPredictions instance for the Joni+Tomi ensemble.
protein_predictions = ProteinPredictions()

In [20]:
# Add the Tomi ensemble written earlier. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/tomi_v2_and_v9.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

9158397it [00:14, 624659.20it/s]


In [21]:
# Add the Joni ensemble written earlier. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/joni_v1_and_v8.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

17874864it [00:34, 516274.29it/s]


In [22]:
# Produce the combined Joni+Tomi ensemble; output_file is read back by a later
# cell when the Diamond scores are added. The combination logic itself lives in
# ProteinPredictions.get_predictions (src.ensemble_utils).
protein_predictions.get_predictions(output_file='../output/ensemble/joni_and_tomi_ensemble.tsv')

## Include Diamond score

#### LB: 0.53991

In [23]:
# Rebind to a new ProteinPredictions instance for the models+Diamond ensemble.
protein_predictions = ProteinPredictions()

In [24]:
# Add the Joni+Tomi ensemble written earlier. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/joni_and_tomi_ensemble.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

17874864it [00:26, 665294.63it/s]


In [25]:
# Add the Diamond submission. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/diamond_submission_netgo.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

32473384it [01:00, 540831.17it/s]


In [26]:
# Produce the own-models+Diamond ensemble; output_file is read back by the
# later "Public" cells as the "own" input. The combination logic itself lives
# in ProteinPredictions.get_predictions (src.ensemble_utils).
protein_predictions.get_predictions(output_file='../output/ensemble/ensemble_j_t_models_and_diamond.tsv')

## Include Public methods

- [QuickGO annotations](https://www.kaggle.com/datasets/mtinti/quick-go-2022-03-02) (Some of these are not present in Private test [according to the host](https://www.kaggle.com/competitions/cafa-5-protein-function-prediction/discussion/432529#2397240), so increase in LB may not show in Private LB)
- [Adaluo's model ensemble](https://www.kaggle.com/code/adaluodao/pytorch-keras-etc-3-blend-cafa-metric-etc) (combination of several methods) - LB:0.53605

#### LB: 0.57075

In [27]:
# Rebind to a new ProteinPredictions instance for the QuickGO + Adaluo + own ensemble.
protein_predictions = ProteinPredictions()

In [28]:
# Add QuickGO annotations. The id is read from column 1 and the GO term from
# column 2; every annotation is assigned a fixed score of 1.0.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/quickgo.tsv') as annot_file:
    for line in tqdm(annot_file):
        fields = line.split('\t')
        protein_id = fields[1]
        go = fields[2].strip()
        score = float(1)
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

4201907it [00:07, 576952.22it/s]


In [30]:
# Add Adaluo's public ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/adaluo_ensembling_multiple_methods_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

15924702it [00:24, 654827.99it/s]


In [31]:
# Add the own-models+Diamond ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/ensemble_j_t_models_and_diamond.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

17874864it [00:31, 566392.94it/s]


In [32]:
# Produce the QuickGO + Adaluo + own ensemble. Presumably written to
# output_file, as with the earlier ensembles — confirm in src.ensemble_utils.
protein_predictions.get_predictions(output_file='../output/ensemble/QuickGO_Adaluo_J_T_diamond_ensemble.tsv')

## Public 2

- Increase the number of GO terms included in the submission from `top=42` to `top=50`.

#### LB: 0.57033

Only a small effect and made the LB worse.

In [4]:
# Rebind to a new ProteinPredictions instance for the top=50 variant.
protein_predictions = ProteinPredictions()

In [5]:
# Add QuickGO annotations. The id is read from column 1 and the GO term from
# column 2; every annotation is assigned a fixed score of 1.0.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/quickgo.tsv') as annot_file:
    for line in tqdm(annot_file):
        fields = line.split('\t')
        protein_id = fields[1]
        go = fields[2].strip()
        score = float(1)
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

4201907it [00:07, 560477.80it/s]


In [6]:
# Add Adaluo's public ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/adaluo_ensembling_multiple_methods_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

15924702it [00:24, 652203.71it/s]


In [7]:
# Add the own-models+Diamond ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/ensemble_j_t_models_and_diamond.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

17874864it [00:31, 576582.32it/s]


In [8]:
# Same inputs as the previous ensemble, but with top=50 instead of the default.
# NOTE(review): `top` presumably caps the number of GO terms kept per protein
# (section notes say 42 -> 50) — confirm in ProteinPredictions.get_predictions.
protein_predictions.get_predictions(
    output_file='../output/ensemble/QuickGO_Adaluo_J_T_diamond_ensemble_top50.tsv',
    top=50
)

## Public 3

- Reduce weighting for own models from 1:1 to 1:0.5.

#### LB: 0.57308

In [9]:
# Rebind to a new ProteinPredictions instance for the down-weighted own-model variant.
protein_predictions = ProteinPredictions()

In [10]:
# Add QuickGO annotations. The id is read from column 1 and the GO term from
# column 2; every annotation is assigned a fixed score of 1.0.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/quickgo.tsv') as annot_file:
    for line in tqdm(annot_file):
        fields = line.split('\t')
        protein_id = fields[1]
        go = fields[2].strip()
        score = float(1)
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

4201907it [00:07, 564458.63it/s]


In [11]:
# Add Adaluo's public ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/adaluo_ensembling_multiple_methods_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

15924702it [00:25, 630272.93it/s]


In [12]:
# Add the own-models+Diamond ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/ensemble_j_t_models_and_diamond.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # Last argument is 0.5 here (1 in the earlier variant).
            # NOTE(review): presumably the source weight — confirm against
            # ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 0.5)

17874864it [00:31, 573344.11it/s]


In [13]:
# Produce the ensemble with the own models added at 0.5. Presumably written to
# output_file, as with the earlier ensembles — confirm in src.ensemble_utils.
protein_predictions.get_predictions(
    output_file='../output/ensemble/QuickGO_Adaluo_J_T_diamond_ensemble_own-w-0.5.tsv',
)

## Public 4

- Add [Blast predictions](https://www.kaggle.com/datasets/samusram/proteinet-best)
- Use a lower weight (0.5) for both the BLAST predictions and our own ensemble.

#### LB: 0.56276

Drop in LB.

In [14]:
# Rebind to a new ProteinPredictions instance for the variant that adds BLAST predictions.
protein_predictions = ProteinPredictions()

In [15]:
# Add QuickGO annotations. The id is read from column 1 and the GO term from
# column 2; every annotation is assigned a fixed score of 1.0.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/quickgo.tsv') as annot_file:
    for line in tqdm(annot_file):
        fields = line.split('\t')
        protein_id = fields[1]
        go = fields[2].strip()
        score = float(1)
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

4201907it [00:07, 567498.53it/s]


In [16]:
# Add Adaluo's public ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/adaluo_ensembling_multiple_methods_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

15924702it [00:25, 627569.83it/s]


In [18]:
# Add the BLAST predictions. Unlike the other submissions, this file is read
# from columns 1-3 (id, GO term, score); column 0 is ignored.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/blast_submission.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[1], fields[2]
        score = float(fields[3].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 0.5) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 0.5)

15013787it [00:28, 526411.30it/s]


In [19]:
# Add the own-models+Diamond ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/ensemble_j_t_models_and_diamond.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 0.5) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 0.5)

17874864it [00:32, 547953.27it/s]


In [20]:
# Produce the QuickGO + Adaluo + BLAST + own ensemble. Presumably written to
# output_file, as with the earlier ensembles — confirm in src.ensemble_utils.
protein_predictions.get_predictions(
    output_file='../output/ensemble/QuickGO_Adaluo_blast_J_T_diamond_ensemble_own-w-0.5.tsv',
)

## Public 5

- Add [SiddhVR's predictions](https://www.kaggle.com/code/siddhvr/cafa5-ems2-embeds-with-pytorch/output). These were obtained by training a model on BlastP, Sprof, QuickGO and DeepGOZero.
- Keep the lower weight (0.5) for our own ensemble.

#### LB: 0.56751

In [21]:
# Rebind to a new ProteinPredictions instance for the variant that adds SiddhVR's predictions.
protein_predictions = ProteinPredictions()

In [22]:
# Add QuickGO annotations. The id is read from column 1 and the GO term from
# column 2; every annotation is assigned a fixed score of 1.0.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/quickgo.tsv') as annot_file:
    for line in tqdm(annot_file):
        fields = line.split('\t')
        protein_id = fields[1]
        go = fields[2].strip()
        score = float(1)
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 0, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 0, 1)

4201907it [00:07, 573614.68it/s]


In [23]:
# Add Adaluo's public ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/adaluo_ensembling_multiple_methods_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

15924702it [00:24, 641373.17it/s]


In [24]:
# Add SiddhVR's public predictions. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../input/ensemble/siddhvr_best_pub.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 1) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 1)

75255907it [01:04, 1172039.61it/s]


In [25]:
# Add the own-models+Diamond ensemble. Rows are tab-separated: id, GO term, score.
# `with` guarantees the file handle is closed when the loop finishes.
with open('../output/ensemble/ensemble_j_t_models_and_diamond.tsv') as pred_file:
    for line in tqdm(pred_file):
        fields = line.split('\t')
        protein_id, go = fields[0], fields[1]
        score = float(fields[2].strip())
        # GO terms missing from the ontology lookup have no branch and are dropped.
        if go in go_terms_dict:
            root = go_terms_dict[go]
            # NOTE(review): trailing args (0, 1, 0.5) — last is presumably the
            # source weight; confirm against ProteinPredictions.add_prediction.
            protein_predictions.add_prediction(protein_id, go, score, root, 0, 1, 0.5)

17874864it [00:31, 568973.34it/s]


In [26]:
# Produce the QuickGO + Adaluo + SiddhVR + own ensemble. Presumably written to
# output_file, as with the earlier ensembles — confirm in src.ensemble_utils.
protein_predictions.get_predictions(
    output_file='../output/ensemble/QuickGO_Adaluo_SiddhVR_J_T_diamond_ensemble_own-w-0.5.tsv',
)