# 2025-12-08-cafa6-11-model-ensemble-V0

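This notebook merges the predictions from two pre-computed submission files (a logistic model and an MLP) into a single ensemble by taking a weighted average of their scores.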

In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np

# Both prediction files are read the same way: tab-separated, no header row,
# three columns (protein id, GO term id, score).
tsv_read_options = dict(
    sep="\t",
    header=None,
    names=["Protein Id", "GO Term Id", "Prediction"],
)

# Predictions from the logistic model.
log_df = pd.read_csv("../submission/cafa6-03.tsv", **tsv_read_options)

# Predictions from the MLP model.
mlp_df = pd.read_csv("../submission/cafa6-08-V3.tsv", **tsv_read_options)

In [2]:
from tqdm import tqdm

# Maps (protein id, GO term id) -> [mlp_score, log_score].
# A slot stays None when that model has no row for the pair.
merged_dict = defaultdict(lambda: [None, None])

# (source frame, slot index in the two-element list, progress-bar label).
# The MLP frame is processed first, so its pairs are inserted into the dict first.
score_sources = (
    (mlp_df, 0, "Merging MLP TSV"),
    (log_df, 1, "Merging Logistic TSV"),
)

for source_df, slot, label in score_sources:
    # itertuples(index=False) yields one (protein id, GO term id, score) triple per row.
    # If a pair occurs more than once in a file, the last row's score wins.
    for entry_id, go_term, score in tqdm(
        source_df.itertuples(index=False),
        total=len(source_df),
        desc=label,
    ):
        merged_dict[(entry_id, go_term)][slot] = score

Merging MLP TSV: 100%|██████████| 5784652/5784652 [00:11<00:00, 493880.54it/s]
Merging Logistic TSV: 100%|██████████| 5214598/5214598 [00:14<00:00, 354754.99it/s]


In [3]:
w = 0.53   # weight applied to the MLP score; the logistic score gets 1 - w

# Blend the two scores for every (protein, GO term) pair. A score missing from
# one model counts as 0.0, so a pair predicted by only one model is scaled down
# by that model's weight. The result is rounded to three decimals.
# NOTE(review): pairs whose blended score rounds to 0.0 are still kept here --
# confirm that is acceptable for the submission format.
submission_list = [
    (
        protein_id,
        term_id,
        round(
            w * (0.0 if mlp is None else mlp)
            + (1 - w) * (0.0 if logistic is None else logistic),
            3,
        ),
    )
    for (protein_id, term_id), (mlp, logistic) in merged_dict.items()
]

In [4]:
# Build the frame of blended rows and order it by protein, then by descending
# score, so each protein's strongest predictions come first.
submission_df = pd.DataFrame(
    submission_list,
    columns=['Protein Id', 'GO Term Id', 'Prediction'],
).sort_values(
    by=['Protein Id', 'Prediction'],
    ascending=[True, False],
)

# Keep at most 1500 rows per protein; because of the sort above these are the
# highest-scoring ones. The index is rebuilt so it runs 0..n-1.
per_protein = submission_df.groupby('Protein Id')
final_submission_df = per_protein.head(1500).reset_index(drop=True)

# Write the result as a tab-separated file with no header row and no index column.
final_submission_df.to_csv(
    "../submission.tsv",
    sep="\t",
    index=False,
    header=False,
)

In [5]:
# Report how many rows final_submission_df contains, formatted with thousands separators.
print(f"{len(final_submission_df):,}")

7,913,263
