In [1]:
import json
import os

import numpy as np
import pandas as pd

## Install DIAMOND

Download and installation instructions: https://github.com/bbuchfink/diamond/wiki

## Create the DIAMOND database

In [4]:
# Directory that holds the DIAMOND database and the search results.
diamond_dir = '../../output/diamond'
# makedirs also creates a missing '../../output' parent and does nothing when the
# directory already exists, so this cell is safe to re-run on a fresh checkout.
os.makedirs(diamond_dir, exist_ok=True)

In [6]:
# Build a DIAMOND database from the training sequences; the resulting
# train_data.dmnd file is written into diamond_dir.
# NOTE(review): the FASTA is read from '../input/...' here, while other cells read
# from '../../input/...' — confirm which prefix matches the notebook's location.
!../../diamond/diamond makedb \
  --in ../input/cafa-5-protein-function-prediction/Train/train_sequences.fasta \
  --db $diamond_dir/train_data.dmnd

/bin/bash: /home/joni/miniconda3/envs/cafa5/lib/libtinfo.so.6: no version information available (required by /bin/bash)
diamond v2.1.8.162 (C) Max Planck Society for the Advancement of Science, Benjamin Buchfink, University of Tuebingen
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 12
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: ../input/cafa-5-protein-function-prediction/Train/train_sequences.fasta
Opening the database file...  [0.003s]
Loading sequences...  [0.332s]
Masking sequences...  [0.794s]
Writing sequences...  [0.698s]
Hashing sequences...  [0.048s]
Loading sequences...  [0s]
Writing trailer...  [0.037s]
Closing the input file...  [0s]
Closing the database file...  [0.013s]

Database sequences  142246
  Database letters  78752603
     Database hash  e491561cb14a4f3b4cbeb2d58ede2339
        Total time  1

In [19]:
# Run blastp: align every test-superset sequence against the training database and
# keep five tab-separated fields per hit (query ID, subject ID, bit score, percent
# identity, e-value). The hits are redirected into test_diamond_2.res in diamond_dir.
# NOTE(review): the binary is invoked as '../diamond/diamond' here but as
# '../../diamond/diamond' in the makedb cell — confirm which path is correct.
!../diamond/diamond blastp \
  --more-sensitive \
  -d $diamond_dir/train_data.dmnd \
  -q "../../input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta" \
  --outfmt 6 qseqid sseqid bitscore pident evalue > $diamond_dir/test_diamond_2.res

/bin/bash: /home/joni/miniconda3/envs/cafa5/lib/libtinfo.so.6: no version information available (required by /bin/bash)


## Find matches from the results

In [20]:
# IDs of the test proteins (presumably the ID array saved alongside the T5 embeddings).
# NOTE(review): this reads from '../input/...' while other cells use '../../input/...'
# — confirm the prefix.
test_protein_ids = np.load('../input/t5embeds/test_ids.npy')

In [39]:
# Load the DIAMOND hits (one row per query/subject alignment). The file has no header
# row, so the column names are given here in the order the fields were requested from
# `diamond blastp --outfmt 6`.
diamond_df = (
    pd.read_csv(
        os.path.join(diamond_dir, 'test_diamond_2.res'),
        sep='\t',
        names=['qseqid', 'sseqid', 'bitscore', 'pident', 'evalue'],
        dtype={'qseqid': str},
    )
    # The query ID is the test protein ID; index by it so that the hits of one
    # protein can be fetched with .loc.
    .rename(columns={'qseqid': 'testid'})
    .set_index('testid')
)
diamond_df.head()

Unnamed: 0_level_0,sseqid,bitscore,pident,evalue
testid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q9CQV8,Q9CQV8,464.0,100.0,1.1e-167
Q9CQV8,P35213,459.0,98.8,1.0500000000000001e-165
Q9CQV8,P31946,458.0,98.8,2.12e-165
Q9CQV8,V9HWD6,458.0,98.8,2.12e-165
Q9CQV8,Q5PRD0,421.0,91.0,1.2e-150


In [45]:
def matches_for_query_id(query_id:str, max_evalue:float=0.001):
    """Return the DIAMOND hits of one query protein that pass the e-value cutoff.

    Args:
        query_id: Test protein ID, looked up in the index of ``diamond_df``.
        max_evalue: Exclusive upper bound; only hits with ``evalue < max_evalue``
            are kept.

    Returns:
        A list of ``[sseqid, bitscore]`` pairs in the row order of ``diamond_df``.
        The list is empty when the query has no hits at all or none below the cutoff.
    """
    # Proteins without any alignment never appear in the result file.
    if query_id not in diamond_df.index:
        return []

    # Selecting with a list always yields a DataFrame; a bare label would collapse a
    # query with exactly one hit into a Series and break the filter below.
    hits = diamond_df.loc[[query_id]]
    hits = hits[hits['evalue'] < max_evalue]
    return hits[['sseqid', 'bitscore']].values.tolist()
    
# Smoke test: look up the hits of the first test protein.
matches = matches_for_query_id(test_protein_ids[0])
matches

[['Q9CQV8', 464.0],
 ['P35213', 459.0],
 ['P31946', 458.0],
 ['V9HWD6', 458.0],
 ['Q5PRD0', 421.0],
 ['P63104', 401.0],
 ['Q5ZKC9', 400.0],
 ['P63101', 399.0],
 ['P63102', 399.0],
 ['P68254', 377.0],
 ['P68255', 377.0],
 ['P27348', 377.0],
 ['P29310', 363.0],
 ['Q20655', 362.0],
 ['P41932', 349.0],
 ['P61982', 343.0],
 ['P61983', 343.0],
 ['P61981', 343.0],
 ['Q5F3W6', 342.0],
 ['Q6PC29', 342.0],
 ['Q04917', 340.0],
 ['Q5ZKJ2', 340.0],
 ['P68510', 339.0],
 ['P68511', 339.0],
 ['P31947', 312.0]]

In [41]:
# GO annotations of the training proteins, indexed by protein ID (EntryID) so that the
# rows of one protein can be selected with .loc.
train_terms = pd.read_csv(
    "../../input/cafa-5-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
    index_col='EntryID',
)
train_terms.head()

Unnamed: 0_level_0,term,aspect
EntryID,Unnamed: 1_level_1,Unnamed: 2_level_1
A0A009IHW8,GO:0008152,BPO
A0A009IHW8,GO:0034655,BPO
A0A009IHW8,GO:0072523,BPO
A0A009IHW8,GO:0044270,BPO
A0A009IHW8,GO:0006753,BPO


In [43]:
# train_terms has one row per (protein, term) pair, so the index repeats each
# protein ID; collapse it to the distinct IDs.
unique_train_ids = train_terms.index.unique()
unique_train_ids[:10]

Index(['A0A009IHW8', 'A0A021WW32', 'A0A023FFD0', 'A0A023GPJ3', 'A0A023GPK8',
       'A0A023GQ97', 'A0A023GRW4', 'A0A023GU64', 'A0A023GU65', 'A0A023GUT0'],
      dtype='object', name='EntryID')

In [33]:
from tqdm.auto import tqdm

In [44]:
def terms_for_train_id(train_id:str):
    """Collect the GO terms of one training protein, grouped by ontology aspect.

    Args:
        train_id: Protein ID, looked up in the index of ``train_terms``.

    Returns:
        A dict with the keys ``'BPO'``, ``'CCO'`` and ``'MFO'``; each value is an
        array of the protein's GO term IDs for that aspect (empty if it has none).

    Raises:
        KeyError: If ``train_id`` is not present in ``train_terms``.
    """
    # Selecting with a list always yields a DataFrame; a bare label would collapse a
    # protein with a single annotation row into a Series and break the filters below.
    rows = train_terms.loc[[train_id]]
    return {
        aspect: rows.loc[rows['aspect'] == aspect, 'term'].values
        for aspect in ('BPO', 'CCO', 'MFO')
    }

# Example: GO terms of the first training hit returned for the test protein above.
terms_d = terms_for_train_id(matches[0][0])
terms_d

{'BPO': array(['GO:0051234', 'GO:0070727', 'GO:0051649', 'GO:0051641',
        'GO:0009987', 'GO:0071702', 'GO:0051179', 'GO:0006886',
        'GO:0008150', 'GO:0071705', 'GO:0008104', 'GO:0046907',
        'GO:0006605', 'GO:0015031', 'GO:0045184', 'GO:0006810',
        'GO:0033036'], dtype=object),
 'CCO': array(['GO:0005737', 'GO:0005829', 'GO:0005575', 'GO:0005622',
        'GO:0110165'], dtype=object),
 'MFO': array(['GO:0003674', 'GO:0005488', 'GO:0019904', 'GO:0005515'],
       dtype=object)}

In [46]:
# Resolve the GO terms of every training protein once, so later steps can use a plain
# dict lookup instead of querying train_terms again.
precalculated_terms = dict(
    (train_id, terms_for_train_id(train_id))
    for train_id in tqdm(unique_train_ids, total=len(unique_train_ids))
)

  0%|          | 0/142246 [00:00<?, ?it/s]

In [47]:
def _ndarray_to_list(obj):
    """Fallback encoder for json.dump: convert NumPy arrays into plain lists."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# terms_for_train_id returns NumPy arrays, which the json module cannot encode on its
# own; `default` converts them to lists while the file is written.
with open("../../output/precalculated_terms.json", "w") as outfile:
    json.dump(precalculated_terms, outfile, default=_ndarray_to_list)

In [48]:
# Preview a handful of keys rather than rendering every protein ID in the output.
list(precalculated_terms)[:10]

In [None]:
# Checkpoint: reload the cached lookup so the slow pre-computation can be skipped.
# After the JSON round trip the per-aspect values are plain lists.
with open("../../output/precalculated_terms.json", "r") as infile:
    precalculated_terms = json.load(infile)

## Create submission

See `src/diamond_submit.py`