In [1]:
import pandas as pd
import os
import numpy as np
import polars as pl

In [2]:
def get_competition_terms():
    """Return the distinct values of the `term` column of train_terms.tsv.

    The file is read as tab-separated and its three columns are renamed,
    by position, to id / term / ont. The result is a polars Series that
    `load_sub_polars` uses to filter the "GO Term Id" column.
    """
    terms_path = '../input/cafa-5-protein-function-prediction/Train/train_terms.tsv'
    annotations = pl.read_csv(
        terms_path,
        separator='\t',
        new_columns=['id', 'term', 'ont'],
    )
    return annotations.get_column('term').unique()

# Module-level on purpose: load_sub_polars reads `targets` as a global.
targets = get_competition_terms()
print(len(targets))

31466


In [3]:
def get_competition_test_ids():
    """Return the distinct protein ids listed in the competition's sample_submission.tsv.

    The file is read as tab-separated and its three columns are renamed,
    by position, to id / term / pred. The result is a polars Series that
    `load_sub_polars` uses to filter the "Protein Id" column.
    """
    # has_header=False: polars otherwise treats the first line as column names
    # and silently drops that prediction row. This matches how load_sub_polars
    # reads submission files (has_header=False).
    # NOTE(review): assumes sample_submission.tsv has no header row, like the
    # submission files read elsewhere in this notebook -- confirm against the file.
    sample_sub = pl.read_csv(
        '../input/cafa-5-protein-function-prediction/sample_submission.tsv',
        separator='\t',
        has_header=False,
        new_columns=['id', 'term', 'pred'],
    )
    return sample_sub['id'].unique()

# Module-level on purpose: load_sub_polars reads `ids` as a global.
ids = get_competition_test_ids()
print(len(ids))

141864


In [4]:
def load_sub_polars(fn, th:float):
    """Lazily scan a submission file and keep only the rows of interest.

    Parameters
    ----------
    fn
        Path of a tab-separated submission file without a header row. Its
        three columns are named 'Protein Id', 'GO Term Id' and 'Prediction'.
    th
        Score threshold; a row is kept only if Prediction is strictly
        greater than `th`.

    Returns
    -------
    A polars LazyFrame (built with `pl.scan_csv`, so the caller must
    `.collect()` it). A row survives only if its GO term is in the
    module-level `targets`, its protein is in the module-level `ids`,
    and its score exceeds `th`.
    """
    column_names = ['Protein Id', 'GO Term Id', 'Prediction']
    submission = pl.scan_csv(
        fn,
        separator='\t',
        has_header=False,
        new_columns=column_names,
        dtypes={'Prediction': pl.Float32},
    )

    # Three independent row predicates, combined in the same order as before.
    is_competition_term = pl.col("GO Term Id").is_in(targets)
    is_above_threshold = pl.col("Prediction") > th
    is_test_protein = pl.col("Protein Id").is_in(ids)

    return submission.filter(is_competition_term & is_above_threshold & is_test_protein)

## Concatenate the DeepGOZero (DGZ) per-ontology submissions (BP, MF, CC)

In [5]:
# Tag interpolated into both the input file names (after "deepgozero") and the
# name of the merged output file.
suffix = '_zero_10'
# Score cut-off passed to load_sub_polars: only rows with Prediction > th are
# kept. Also embedded in the output file name ("th-{th}_...").
th = 0.5

In [6]:
# One lazy frame per ontology file (bp, mf, cc). The inputs are the "th-0.1"
# files; load_sub_polars re-filters them with the stricter `th` set above.
df_dqz_bp, df_dqz_mf, df_dqz_cc = (
    load_sub_polars(f'../output/DGZ/{ont}_th-0.1_deepgozero{suffix}_submission.tsv', th=th)
    for ont in ('bp', 'mf', 'cc')
)

In [7]:
# The three lazy per-ontology frames are stacked (in the order BP, CC, MF) and
# the combined query is executed by .collect(), producing an in-memory
# DataFrame -- this is the RAM-hungry step of the notebook.
df_dqz = pl.concat([df_dqz_bp, df_dqz_cc, df_dqz_mf], rechunk=False, parallel=False).collect()

In [8]:
# Write the merged predictions as a tab-separated file without a header row,
# i.e. the same layout load_sub_polars reads. Floats are written with three
# decimal places.
df_dqz.write_csv(
    f'../output/DGZ/th-{th}_deepgozero{suffix}_submission.tsv',
    has_header=False,
    separator='\t',
    float_precision=3,
)