# NetGO2 Benchmark

We evaluate our method on the NetGO2 benchmark, following the [DeepGOZero project](https://github.com/bio-ontology-research-group/deepgozero).


We collected the data from that GitHub repository and evaluate with the authors' own code (`evaluate.py`) to ensure a fair comparison among the methods.

## Importing Modules

In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial import distance
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()

import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'DomainPFP')))

from domaingo_embedding_model import DomainGOEmbeddingModel, load_domaingo_embedding_model_weights
from domain_embedding import DomainEmbedding
from knn_model import *
from data_processing import compute_domain_go_intersection, compute_domain_go_score

## MFO

### Load mapper files

In [2]:
# Load the MFO domain and GO mapper objects from the processed-data directory.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
# Context managers make sure each file handle is closed right after reading.
with open('./../data/processed/domain_mapper_netgo_mf.p','rb') as fin:
    domain_mapper = pickle.load(fin)
with open('./../data/processed/go_mapper_netgo_mf.p','rb') as fin:
    go_mapper = pickle.load(fin)

### Load model

In [3]:
# Location of the saved MFO model weights
mdl_path = './../saved_models/netgo_mf'

# Build the model from the two mappers, then hand it to the weight loader;
# `mdl` ends up bound to whatever the loader returns.
mdl = load_domaingo_embedding_model_weights(
    DomainGOEmbeddingModel(domain_mapper, go_mapper),
    mdl_path,
)

# Domain embedding object built from the loaded model and the domain mapper
dmn_embedding = DomainEmbedding(mdl, domain_mapper)

### Prepare data

In [4]:
# Load the pickled per-protein domain and GO annotation objects for the
# train / valid / test splits of the MFO benchmark.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
processed_dir = './../data/processed'
split_data = {}
for kind in ('domains', 'go'):
    for split in ('train', 'valid', 'test'):
        # context manager closes each handle as soon as the object is read
        with open(f'{processed_dir}/all_protein_{kind}_netgo_mf_{split}.p', 'rb') as fin:
            split_data[kind, split] = pickle.load(fin)

all_protein_domains = split_data['domains', 'train']
all_protein_go = split_data['go', 'train']
all_protein_domains_valid = split_data['domains', 'valid']
all_protein_go_valid = split_data['go', 'valid']
all_protein_domains_test = split_data['domains', 'test']
all_protein_go_test = split_data['go', 'test']

# prepare_knn_data is not imported by name in this notebook; presumably it is
# provided by `from knn_model import *` (confirm in knn_model).
# It returns nine values: (embeddings, protein ids, GO terms) for each split.
(
    X_dmn_embd_train, Y_p_id_train, Y_go_terms_train,
    X_dmn_embd_valid, Y_p_id_valid, Y_go_terms_valid,
    X_dmn_embd_test, Y_p_id_test, Y_go_terms_test,
) = prepare_knn_data(
    all_protein_domains, all_protein_go,
    all_protein_domains_valid, all_protein_go_valid,
    all_protein_domains_test, all_protein_go_test,
    dmn_embedding,
)

100%|██████████| 62605/62605 [00:02<00:00, 21678.04it/s]
100%|██████████| 1128/1128 [00:00<00:00, 28821.39it/s]
100%|██████████| 505/505 [00:00<00:00, 24917.93it/s]


### Train KNN model

In [5]:
n_neigh = 1000      # value passed to Weighted_KNN_Model (presumably the neighbour count - confirm in knn_model)

knn_mdl = Weighted_KNN_Model(n_neigh)
knn_mdl.train(X_dmn_embd_train,Y_p_id_train)

# Persist the trained model. The context manager guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open('../saved_models/knn_netgo_mf.p','wb') as fout:
    pickle.dump(knn_mdl, fout)

### Get predictions

In [6]:
# Query the trained KNN model with the test embeddings and the training GO terms.
# go_preds is indexed by test position below and each entry maps GO term -> score
# (presumably a probability derived from the neighbours - confirm in knn_model).
go_preds = knn_mdl.get_neighbor_go_terms_proba_batch(Y_go_terms_train, X_dmn_embd_test)

100%|██████████| 505/505 [00:00<00:00, 2384.86it/s]
computing go terms: 100%|██████████| 505/505 [00:02<00:00, 214.46it/s]
100%|██████████| 505/505 [00:00<00:00, 10900.97it/s]


### Convert to DeepGOZero result format

In [7]:
# Convert the KNN predictions for the MFO test proteins into DataFrames with the
# columns ['proteins', 'preds', 'prop_annotations'] and pickle one file per
# ensemble: domain only, domain+BLAST, domain+PPI and domain+BLAST+PPI.
benchmark_dir = './../data/netgo_benchmark/mf'
score_columns = ['proteins','preds','prop_annotations']

test_df = pd.read_pickle(f'{benchmark_dir}/test_data.pkl')
blast_ppi_df = pd.read_pickle(f'{benchmark_dir}/blastppi_netgo.pkl')
terms_df = pd.read_pickle(f'{benchmark_dir}/terms.pkl')
terms = terms_df['gos'].values.flatten()
terms_dict = {v: i for i, v in enumerate(terms)}        # GO term -> position in the score vector

proteins = blast_ppi_df['proteins'].values              # not used by the cells shown here; kept for compatibility

# Rows are collected in separate lists so that re-running the cell never
# appends to an already-built DataFrame.
domain_rows = []
domain_blast_rows = []
domain_ppi_rows = []
domain_blast_ppi_rows = []

for i, prtn in enumerate(Y_p_id_test):

    prop_annotations = test_df[test_df['proteins']==prtn]['prop_annotations'].values[0]

    # filter the BLAST/PPI table once per protein and reuse the matching row
    blast_ppi_row = blast_ppi_df[blast_ppi_df['proteins']==prtn]
    blast_preds = blast_ppi_row['blast_preds'].values[0]
    ppi_preds = blast_ppi_row['ppi_preds'].values[0]

    # scatter the KNN scores into a dense vector aligned with `terms`
    domain_preds = np.zeros(len(terms))
    for go_trm, score in go_preds[i].items():
        domain_preds[terms_dict[go_trm]] = score

    has_blast = np.max(blast_preds) > 0         # there is actually some prediction from blast
    has_ppi = np.max(ppi_preds) > 0             # there is actually some prediction from ppi

    # Each ensemble is the mean of the domain scores and whichever of the
    # other score vectors is non-empty for this protein.
    domain_blast_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_score += np.array(blast_preds)
        normalizer += 1
    domain_blast_score /= normalizer

    domain_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_ppi:
        domain_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_ppi_score /= normalizer

    domain_blast_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_ppi_score += np.array(blast_preds)
        normalizer += 1
    if has_ppi:
        domain_blast_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_blast_ppi_score /= normalizer

    domain_rows.append([prtn,domain_preds,prop_annotations])
    domain_blast_rows.append([prtn,domain_blast_score,prop_annotations])
    domain_ppi_rows.append([prtn,domain_ppi_score,prop_annotations])
    domain_blast_ppi_rows.append([prtn,domain_blast_ppi_score,prop_annotations])

domain_scores = pd.DataFrame(domain_rows,columns=score_columns)
domain_blast_scores = pd.DataFrame(domain_blast_rows,columns=score_columns)
domain_ppi_scores = pd.DataFrame(domain_ppi_rows,columns=score_columns)
domain_blast_ppi_scores = pd.DataFrame(domain_blast_ppi_rows,columns=score_columns)

# The file suffixes match the --model values passed to evaluate.py in the
# evaluation cells. Context managers close every output file after writing.
for model_name, scores_df in (
    ('domain', domain_scores),
    ('domain_blast', domain_blast_scores),
    ('domain_ppi', domain_ppi_scores),
    ('domain_blast_ppi', domain_blast_ppi_scores),
):
    with open(f'{benchmark_dir}/predictions_{model_name}.pkl','wb') as fout:
        pickle.dump(scores_df, fout)


### Run evaluation 

#### Only Domain

In [8]:
# Run evaluate.py on the MF ontology for the domain-only predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont mf --model domain

Computing Fmax
AVG IC 28507.029
Fscore: 0.002085492154902259, Precision: 0.001043836510876069, Recall: 0.9981869615532982 S: 28498.241293151226, RU: 0.09244652631333071, MI: 28498.241293001283 threshold: 0.0, WFmax: 0.0006163457360236614
AVG IC 128.348
Fscore: 0.20336732451467893, Precision: 0.1142000877850835, Recall: 0.9277635320369245 S: 121.01220180008492, RU: 1.5351779407929003, MI: 121.00246366580551 threshold: 0.01, WFmax: 0.1317090948708136
AVG IC 75.427
Fscore: 0.2810818514822161, Precision: 0.16637492744607194, Recall: 0.9051050855542755 S: 68.46020885255902, RU: 1.8877338348521997, MI: 68.43417755116776 threshold: 0.02, WFmax: 0.19662725968089442
AVG IC 54.960
Fscore: 0.337877045862161, Precision: 0.20850123824440636, Recall: 0.890330477555642 S: 48.26910941432648, RU: 2.1415414526910497, MI: 48.221579441766806 threshold: 0.03, WFmax: 0.24543017797037445
AVG IC 43.043
Fscore: 0.3914511054089519, Precision: 0.25172407667338337, Recall: 0.8798237979143494 S: 36.49109714024101,

#### Domain + BLAST

In [9]:
# Run evaluate.py on the MF ontology for the domain + BLAST predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont mf --model domain_blast

Computing Fmax
AVG IC 28507.029
Fscore: 0.002085492154902259, Precision: 0.001043836510876069, Recall: 0.9981869615532982 S: 28498.241293151073, RU: 0.09244652631333071, MI: 28498.24129300113 threshold: 0.0, WFmax: 0.0006163457360236608
AVG IC 111.309
Fscore: 0.24349363130274704, Precision: 0.1398611282581596, Recall: 0.9400106627540626 S: 103.69981116455205, RU: 1.2631846835591367, MI: 103.69211734755432 threshold: 0.01, WFmax: 0.1589535193517585
AVG IC 73.454
Fscore: 0.3281719707795337, Precision: 0.1997032459231858, Recall: 0.9200175252696617 S: 66.10620188057023, RU: 1.5147436644327643, MI: 66.08884534250669 threshold: 0.02, WFmax: 0.23342128662303627
AVG IC 55.582
Fscore: 0.3776016623406681, Precision: 0.23839715402723785, Recall: 0.9075185061369794 S: 48.43362708665224, RU: 1.7020454024351583, MI: 48.40371136820959 threshold: 0.03, WFmax: 0.2801357505565758
AVG IC 45.565
Fscore: 0.42102764849584384, Precision: 0.27489985529195987, Recall: 0.8988008560914317 S: 38.53948206814271, 

#### Domain + PPI

In [10]:
# Run evaluate.py on the MF ontology for the domain + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont mf --model domain_ppi

Computing Fmax
AVG IC 28507.029
Fscore: 0.002085492154902259, Precision: 0.001043836510876069, Recall: 0.9981869615532982 S: 28498.24129315115, RU: 0.09244652631333071, MI: 28498.241293001203 threshold: 0.0, WFmax: 0.0006163457360236608
AVG IC 232.400
Fscore: 0.11282811068578878, Precision: 0.05991008017339506, Recall: 0.9667467316057737 S: 224.44746874240266, RU: 0.9255721393470633, MI: 224.44556030602763 threshold: 0.01, WFmax: 0.07596369320375497
AVG IC 119.997
Fscore: 0.18406874898825198, Precision: 0.10190542735059989, Recall: 0.9501319059007989 S: 112.35113367632925, RU: 1.2281270318893347, MI: 112.34442105574244 threshold: 0.02, WFmax: 0.1357974432372632
AVG IC 80.777
Fscore: 0.2452953259433211, Precision: 0.14114205122358656, Recall: 0.9359997472636894 S: 73.383365819779, RU: 1.4719321179215448, MI: 73.36860224155657 threshold: 0.03, WFmax: 0.18802991929916288
AVG IC 59.541
Fscore: 0.29930556829808175, Precision: 0.17857192799132177, Recall: 0.9240863540258808 S: 52.38029513608

#### Domain + BLAST + PPI

In [11]:
# Run evaluate.py on the MF ontology for the domain + BLAST + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont mf --model domain_blast_ppi

Computing Fmax
AVG IC 28507.029
Fscore: 0.002085492154902259, Precision: 0.001043836510876069, Recall: 0.9981869615532982 S: 28498.24129315115, RU: 0.09244652631333071, MI: 28498.241293001203 threshold: 0.0, WFmax: 0.0006163457360236604
AVG IC 186.495
Fscore: 0.1408670392803926, Precision: 0.07595062498238193, Recall: 0.969615315386124 S: 178.3968779124693, RU: 0.7800287254919934, MI: 178.39517259192831 threshold: 0.01, WFmax: 0.09769704393159676
AVG IC 101.884
Fscore: 0.2245146168031924, Precision: 0.1271652522731391, Recall: 0.9575585382798417 S: 94.0114267497298, RU: 1.0019767278485476, MI: 94.00608704842821 threshold: 0.02, WFmax: 0.16610116043459597
AVG IC 70.292
Fscore: 0.2888494902180087, Precision: 0.17057952158705111, Recall: 0.9419275269549385 S: 62.688175697246045, RU: 1.2632326472781623, MI: 62.675446671943554 threshold: 0.03, WFmax: 0.22289052016086305
AVG IC 53.716
Fscore: 0.34356971217688476, Precision: 0.2107086416204826, Recall: 0.9299340540135915 S: 46.31952735560758,

## BPO

### Load mapper files

In [12]:
# Load the BPO domain and GO mapper objects from the processed-data directory.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
# Context managers make sure each file handle is closed right after reading.
with open('./../data/processed/domain_mapper_netgo_bp.p','rb') as fin:
    domain_mapper = pickle.load(fin)
with open('./../data/processed/go_mapper_netgo_bp.p','rb') as fin:
    go_mapper = pickle.load(fin)

### Load model

In [13]:
# Location of the saved BPO model weights
mdl_path = './../saved_models/netgo_bp'

# Build the model from the two mappers, then hand it to the weight loader;
# `mdl` ends up bound to whatever the loader returns.
mdl = load_domaingo_embedding_model_weights(
    DomainGOEmbeddingModel(domain_mapper, go_mapper),
    mdl_path,
)

# Domain embedding object built from the loaded model and the domain mapper
dmn_embedding = DomainEmbedding(mdl, domain_mapper)

### Prepare data

In [14]:
# Load the pickled per-protein domain and GO annotation objects for the
# train / valid / test splits of the BPO benchmark.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
processed_dir = './../data/processed'
split_data = {}
for kind in ('domains', 'go'):
    for split in ('train', 'valid', 'test'):
        # context manager closes each handle as soon as the object is read
        with open(f'{processed_dir}/all_protein_{kind}_netgo_bp_{split}.p', 'rb') as fin:
            split_data[kind, split] = pickle.load(fin)

all_protein_domains = split_data['domains', 'train']
all_protein_go = split_data['go', 'train']
all_protein_domains_valid = split_data['domains', 'valid']
all_protein_go_valid = split_data['go', 'valid']
all_protein_domains_test = split_data['domains', 'test']
all_protein_go_test = split_data['go', 'test']

# prepare_knn_data is not imported by name in this notebook; presumably it is
# provided by `from knn_model import *` (confirm in knn_model).
# It returns nine values: (embeddings, protein ids, GO terms) for each split.
(
    X_dmn_embd_train, Y_p_id_train, Y_go_terms_train,
    X_dmn_embd_valid, Y_p_id_valid, Y_go_terms_valid,
    X_dmn_embd_test, Y_p_id_test, Y_go_terms_test,
) = prepare_knn_data(
    all_protein_domains, all_protein_go,
    all_protein_domains_valid, all_protein_go_valid,
    all_protein_domains_test, all_protein_go_test,
    dmn_embedding,
)

100%|██████████| 89828/89828 [00:04<00:00, 22014.81it/s]
100%|██████████| 1124/1124 [00:00<00:00, 25306.36it/s]
100%|██████████| 491/491 [00:00<00:00, 22139.12it/s]


### Train KNN model

In [15]:
n_neigh = 800       # value passed to Weighted_KNN_Model (presumably the neighbour count - confirm in knn_model)

knn_mdl = Weighted_KNN_Model(n_neigh)
knn_mdl.train(X_dmn_embd_train,Y_p_id_train)

# Persist the trained model. The context manager guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open('../saved_models/knn_netgo_bp.p','wb') as fout:
    pickle.dump(knn_mdl, fout)

### Get predictions

In [17]:
# Query the trained KNN model with the test embeddings and the training GO terms.
# go_preds is indexed by test position below and each entry maps GO term -> score
# (presumably a probability derived from the neighbours - confirm in knn_model).
go_preds = knn_mdl.get_neighbor_go_terms_proba_batch(Y_go_terms_train, X_dmn_embd_test)

100%|██████████| 491/491 [00:00<00:00, 2990.93it/s]
computing go terms: 100%|██████████| 491/491 [00:10<00:00, 46.75it/s]
100%|██████████| 491/491 [00:00<00:00, 1196.44it/s]


### Convert to DeepGOZero result format

In [18]:
# Convert the KNN predictions for the BPO test proteins into DataFrames with the
# columns ['proteins', 'preds', 'prop_annotations'] and pickle one file per
# ensemble: domain only, domain+BLAST, domain+PPI and domain+BLAST+PPI.
benchmark_dir = './../data/netgo_benchmark/bp'
score_columns = ['proteins','preds','prop_annotations']

test_df = pd.read_pickle(f'{benchmark_dir}/test_data.pkl')
blast_ppi_df = pd.read_pickle(f'{benchmark_dir}/blastppi_netgo.pkl')
terms_df = pd.read_pickle(f'{benchmark_dir}/terms.pkl')
terms = terms_df['gos'].values.flatten()
terms_dict = {v: i for i, v in enumerate(terms)}        # GO term -> position in the score vector

proteins = blast_ppi_df['proteins'].values              # not used by the cells shown here; kept for compatibility

# Rows are collected in separate lists so that re-running the cell never
# appends to an already-built DataFrame.
domain_rows = []
domain_blast_rows = []
domain_ppi_rows = []
domain_blast_ppi_rows = []

for i, prtn in enumerate(Y_p_id_test):

    prop_annotations = test_df[test_df['proteins']==prtn]['prop_annotations'].values[0]

    # filter the BLAST/PPI table once per protein and reuse the matching row
    blast_ppi_row = blast_ppi_df[blast_ppi_df['proteins']==prtn]
    blast_preds = blast_ppi_row['blast_preds'].values[0]
    ppi_preds = blast_ppi_row['ppi_preds'].values[0]

    # scatter the KNN scores into a dense vector aligned with `terms`
    domain_preds = np.zeros(len(terms))
    for go_trm, score in go_preds[i].items():
        domain_preds[terms_dict[go_trm]] = score

    has_blast = np.max(blast_preds) > 0         # there is actually some prediction from blast
    has_ppi = np.max(ppi_preds) > 0             # there is actually some prediction from ppi

    # Each ensemble is the mean of the domain scores and whichever of the
    # other score vectors is non-empty for this protein.
    domain_blast_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_score += np.array(blast_preds)
        normalizer += 1
    domain_blast_score /= normalizer

    domain_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_ppi:
        domain_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_ppi_score /= normalizer

    domain_blast_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_ppi_score += np.array(blast_preds)
        normalizer += 1
    if has_ppi:
        domain_blast_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_blast_ppi_score /= normalizer

    domain_rows.append([prtn,domain_preds,prop_annotations])
    domain_blast_rows.append([prtn,domain_blast_score,prop_annotations])
    domain_ppi_rows.append([prtn,domain_ppi_score,prop_annotations])
    domain_blast_ppi_rows.append([prtn,domain_blast_ppi_score,prop_annotations])

domain_scores = pd.DataFrame(domain_rows,columns=score_columns)
domain_blast_scores = pd.DataFrame(domain_blast_rows,columns=score_columns)
domain_ppi_scores = pd.DataFrame(domain_ppi_rows,columns=score_columns)
domain_blast_ppi_scores = pd.DataFrame(domain_blast_ppi_rows,columns=score_columns)

# The file suffixes match the --model values passed to evaluate.py in the
# evaluation cells. Context managers close every output file after writing.
for model_name, scores_df in (
    ('domain', domain_scores),
    ('domain_blast', domain_blast_scores),
    ('domain_ppi', domain_ppi_scores),
    ('domain_blast_ppi', domain_blast_ppi_scores),
):
    with open(f'{benchmark_dir}/predictions_{model_name}.pkl','wb') as fout:
        pickle.dump(scores_df, fout)


### Run evaluation 

#### Only Domain

In [19]:
# Run evaluate.py on the BP ontology for the domain-only predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont bp --model domain

Computing Fmax
AVG IC 50277.841
Fscore: 0.0027253326047139933, Precision: 0.0013645266394999988, Recall: 0.9994932785755836 S: 50247.76942713187, RU: 0.06749593154448472, MI: 50247.76942708653 threshold: 0.0, WFmax: 0.0011954886359594058
AVG IC 647.655
Fscore: 0.09935452341089715, Precision: 0.05284192338401732, Recall: 0.8294858419603002 S: 625.9388748747268, RU: 8.366702427423514, MI: 625.8829550083066 threshold: 0.01, WFmax: 0.07577679894127816
AVG IC 384.769
Fscore: 0.1436556580898704, Precision: 0.07905909001407518, Recall: 0.7852908125171004 S: 364.93265064707225, RU: 10.161116027138208, MI: 364.79116111740035 threshold: 0.02, WFmax: 0.1128446637609201
AVG IC 287.559
Fscore: 0.1775003998191835, Precision: 0.10059715035686845, Recall: 0.7536131128631216 S: 269.03396827600966, RU: 11.372959664937001, MI: 268.79347439027737 threshold: 0.03, WFmax: 0.1433924650127882
AVG IC 227.524
Fscore: 0.20278073949173056, Precision: 0.11771912520765451, Recall: 0.7309550111699726 S: 209.97424452

#### Domain + BLAST

In [20]:
# Run evaluate.py on the BP ontology for the domain + BLAST predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont bp --model domain_blast

Computing Fmax
AVG IC 50277.841
Fscore: 0.0027253326047139933, Precision: 0.0013645266394999988, Recall: 0.9994932785755836 S: 50247.769427131716, RU: 0.06749593154448472, MI: 50247.76942708638 threshold: 0.0, WFmax: 0.001195488635959413
AVG IC 566.440
Fscore: 0.11784981647832861, Precision: 0.06354576009550103, Recall: 0.8103328561551888 S: 545.4382660611677, RU: 9.061607629981241, MI: 545.3629886148981 threshold: 0.01, WFmax: 0.0903534455517457
AVG IC 383.612
Fscore: 0.15747975021777877, Precision: 0.0876261262914853, Recall: 0.7764433053592503 S: 364.14193054446326, RU: 10.516561514241905, MI: 363.9900376584033 threshold: 0.02, WFmax: 0.12391711976996962
AVG IC 297.022
Fscore: 0.18966900190421265, Precision: 0.1084918140211548, Recall: 0.7533522141629997 S: 278.56879471016686, RU: 11.44968534404315, MI: 278.33339377767356 threshold: 0.03, WFmax: 0.15205798416875513
AVG IC 243.056
Fscore: 0.2165690879583276, Precision: 0.12697277774877697, Recall: 0.735713685187844 S: 225.26884943022

#### Domain + PPI

In [21]:
# Run evaluate.py on the BP ontology for the domain + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont bp --model domain_ppi

Computing Fmax
AVG IC 50277.841
Fscore: 0.0027253326047139933, Precision: 0.0013645266394999988, Recall: 0.9994932785755836 S: 50247.769427131854, RU: 0.06749593154448472, MI: 50247.769427086525 threshold: 0.0, WFmax: 0.0011954886359594027
AVG IC 987.304
Fscore: 0.07139847761337445, Precision: 0.03717300009479966, Recall: 0.9004496306487478 S: 962.3540973524614, RU: 5.175120020157576, MI: 962.3401824842646 threshold: 0.01, WFmax: 0.0539849304193731
AVG IC 550.709
Fscore: 0.10904612890746698, Precision: 0.05821763358644025, Recall: 0.8591539836884324 S: 527.7318731677389, RU: 7.114167315953108, MI: 527.6839191983503 threshold: 0.02, WFmax: 0.08763944962816378
AVG IC 384.936
Fscore: 0.14082232944495063, Precision: 0.07692476880739195, Recall: 0.8315461758380617 S: 363.25958158642914, RU: 8.365624243407575, MI: 363.16324145673894 threshold: 0.03, WFmax: 0.116712252733208
AVG IC 296.585
Fscore: 0.16726851594594183, Precision: 0.09331405685730228, Recall: 0.8062411210155196 S: 275.922348561

#### Domain + BLAST + PPI

In [22]:
# Run evaluate.py on the BP ontology for the domain + BLAST + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont bp --model domain_blast_ppi

Computing Fmax
AVG IC 50277.841
Fscore: 0.0027253326047139933, Precision: 0.0013645266394999988, Recall: 0.9994932785755836 S: 50247.76942713162, RU: 0.06749593154448472, MI: 50247.76942708629 threshold: 0.0, WFmax: 0.0011954886359594157
AVG IC 841.782
Fscore: 0.08235252685884585, Precision: 0.043178396360923334, Recall: 0.8880154734206535 S: 817.4166263634593, RU: 5.753239140603382, MI: 817.3963795459399 threshold: 0.01, WFmax: 0.06353430085187876
AVG IC 492.306
Fscore: 0.12349310322148534, Precision: 0.06658985312240426, Recall: 0.8489444220382049 S: 469.8321317907979, RU: 7.603292725406474, MI: 469.7706057245573 threshold: 0.02, WFmax: 0.1000756021016662
AVG IC 353.485
Fscore: 0.1573169800452564, Precision: 0.08700500408507067, Recall: 0.8199449727808357 S: 332.26695178163294, RU: 8.803832470078156, MI: 332.15029697427155 threshold: 0.03, WFmax: 0.13131501756046907
AVG IC 277.570
Fscore: 0.18556962060495574, Precision: 0.1050360731515285, Recall: 0.7954895950205636 S: 257.4142289090

## CCO

### Load mapper files

In [23]:
# Load the CCO domain and GO mapper objects from the processed-data directory.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
# Context managers make sure each file handle is closed right after reading.
with open('./../data/processed/domain_mapper_netgo_cc.p','rb') as fin:
    domain_mapper = pickle.load(fin)
with open('./../data/processed/go_mapper_netgo_cc.p','rb') as fin:
    go_mapper = pickle.load(fin)

### Load model

In [24]:
# Location of the saved CCO model weights
mdl_path = './../saved_models/netgo_cc'

# Build the model from the two mappers, then hand it to the weight loader;
# `mdl` ends up bound to whatever the loader returns.
mdl = load_domaingo_embedding_model_weights(
    DomainGOEmbeddingModel(domain_mapper, go_mapper),
    mdl_path,
)

# Domain embedding object built from the loaded model and the domain mapper
dmn_embedding = DomainEmbedding(mdl, domain_mapper)

### Prepare data

In [25]:
# Load the pickled per-protein domain and GO annotation objects for the
# train / valid / test splits of the CCO benchmark.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were generated by this project.
processed_dir = './../data/processed'
split_data = {}
for kind in ('domains', 'go'):
    for split in ('train', 'valid', 'test'):
        # context manager closes each handle as soon as the object is read
        with open(f'{processed_dir}/all_protein_{kind}_netgo_cc_{split}.p', 'rb') as fin:
            split_data[kind, split] = pickle.load(fin)

all_protein_domains = split_data['domains', 'train']
all_protein_go = split_data['go', 'train']
all_protein_domains_valid = split_data['domains', 'valid']
all_protein_go_valid = split_data['go', 'valid']
all_protein_domains_test = split_data['domains', 'test']
all_protein_go_test = split_data['go', 'test']

# prepare_knn_data is not imported by name in this notebook; presumably it is
# provided by `from knn_model import *` (confirm in knn_model).
# It returns nine values: (embeddings, protein ids, GO terms) for each split.
(
    X_dmn_embd_train, Y_p_id_train, Y_go_terms_train,
    X_dmn_embd_valid, Y_p_id_valid, Y_go_terms_valid,
    X_dmn_embd_test, Y_p_id_test, Y_go_terms_test,
) = prepare_knn_data(
    all_protein_domains, all_protein_go,
    all_protein_domains_valid, all_protein_go_valid,
    all_protein_domains_test, all_protein_go_test,
    dmn_embedding,
)

100%|██████████| 81377/81377 [00:03<00:00, 23338.44it/s]
100%|██████████| 1359/1359 [00:00<00:00, 26177.08it/s]
100%|██████████| 268/268 [00:00<00:00, 25066.87it/s]


### Train KNN model

In [26]:
n_neigh = 1200      # value passed to Weighted_KNN_Model (presumably the neighbour count - confirm in knn_model)

knn_mdl = Weighted_KNN_Model(n_neigh)
knn_mdl.train(X_dmn_embd_train,Y_p_id_train)

# Persist the trained model. The context manager guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open('../saved_models/knn_netgo_cc.p','wb') as fout:
    pickle.dump(knn_mdl, fout)

### Get predictions

In [27]:
# Query the trained KNN model with the test embeddings and the training GO terms.
# go_preds is indexed by test position below and each entry maps GO term -> score
# (presumably a probability derived from the neighbours - confirm in knn_model).
go_preds = knn_mdl.get_neighbor_go_terms_proba_batch(Y_go_terms_train, X_dmn_embd_test)

100%|██████████| 268/268 [00:00<00:00, 2000.10it/s]
computing go terms: 100%|██████████| 268/268 [00:02<00:00, 120.50it/s]
100%|██████████| 268/268 [00:00<00:00, 9979.97it/s]


### Convert to DeepGOZero result format

In [28]:
# Convert the KNN predictions for the CCO test proteins into DataFrames with the
# columns ['proteins', 'preds', 'prop_annotations'] and pickle one file per
# ensemble: domain only, domain+BLAST, domain+PPI and domain+BLAST+PPI.
benchmark_dir = './../data/netgo_benchmark/cc'
score_columns = ['proteins','preds','prop_annotations']

test_df = pd.read_pickle(f'{benchmark_dir}/test_data.pkl')
blast_ppi_df = pd.read_pickle(f'{benchmark_dir}/blastppi_netgo.pkl')
terms_df = pd.read_pickle(f'{benchmark_dir}/terms.pkl')
terms = terms_df['gos'].values.flatten()
terms_dict = {v: i for i, v in enumerate(terms)}        # GO term -> position in the score vector

proteins = blast_ppi_df['proteins'].values              # not used by the cells shown here; kept for compatibility

# Rows are collected in separate lists so that re-running the cell never
# appends to an already-built DataFrame.
domain_rows = []
domain_blast_rows = []
domain_ppi_rows = []
domain_blast_ppi_rows = []

for i, prtn in enumerate(Y_p_id_test):

    prop_annotations = test_df[test_df['proteins']==prtn]['prop_annotations'].values[0]

    # filter the BLAST/PPI table once per protein and reuse the matching row
    blast_ppi_row = blast_ppi_df[blast_ppi_df['proteins']==prtn]
    blast_preds = blast_ppi_row['blast_preds'].values[0]
    ppi_preds = blast_ppi_row['ppi_preds'].values[0]

    # scatter the KNN scores into a dense vector aligned with `terms`
    domain_preds = np.zeros(len(terms))
    for go_trm, score in go_preds[i].items():
        domain_preds[terms_dict[go_trm]] = score

    has_blast = np.max(blast_preds) > 0         # there is actually some prediction from blast
    has_ppi = np.max(ppi_preds) > 0             # there is actually some prediction from ppi

    # Each ensemble is the mean of the domain scores and whichever of the
    # other score vectors is non-empty for this protein.
    domain_blast_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_score += np.array(blast_preds)
        normalizer += 1
    domain_blast_score /= normalizer

    domain_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_ppi:
        domain_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_ppi_score /= normalizer

    domain_blast_ppi_score = np.array(domain_preds)
    normalizer = 1
    if has_blast:
        domain_blast_ppi_score += np.array(blast_preds)
        normalizer += 1
    if has_ppi:
        domain_blast_ppi_score += np.array(ppi_preds)
        normalizer += 1
    domain_blast_ppi_score /= normalizer

    domain_rows.append([prtn,domain_preds,prop_annotations])
    domain_blast_rows.append([prtn,domain_blast_score,prop_annotations])
    domain_ppi_rows.append([prtn,domain_ppi_score,prop_annotations])
    domain_blast_ppi_rows.append([prtn,domain_blast_ppi_score,prop_annotations])

domain_scores = pd.DataFrame(domain_rows,columns=score_columns)
domain_blast_scores = pd.DataFrame(domain_blast_rows,columns=score_columns)
domain_ppi_scores = pd.DataFrame(domain_ppi_rows,columns=score_columns)
domain_blast_ppi_scores = pd.DataFrame(domain_blast_ppi_rows,columns=score_columns)

# The file suffixes match the --model values passed to evaluate.py in the
# evaluation cells. Context managers close every output file after writing.
for model_name, scores_df in (
    ('domain', domain_scores),
    ('domain_blast', domain_blast_scores),
    ('domain_ppi', domain_ppi_scores),
    ('domain_blast_ppi', domain_blast_ppi_scores),
):
    with open(f'{benchmark_dir}/predictions_{model_name}.pkl','wb') as fout:
        pickle.dump(scores_df, fout)


### Run evaluation 

#### Only Domain

In [29]:
# Run evaluate.py on the CC ontology for the domain-only predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont cc --model domain

Computing Fmax
AVG IC 11741.146
Fscore: 0.008581387728563953, Precision: 0.0043091832504145965, Recall: 1.0 S: 11727.556087269693, RU: 0.0, MI: 11727.556087269693 threshold: 0.0, WFmax: 0.0023122248383630603
AVG IC 161.280
Fscore: 0.23516932979782623, Precision: 0.13459635711698628, Recall: 0.9303288200453852 S: 150.28821701581697, RU: 2.576170377655353, MI: 150.26613563933356 threshold: 0.01, WFmax: 0.14451234196437604
AVG IC 96.133
Fscore: 0.31352621830104815, Precision: 0.18946967280610957, Recall: 0.9081313253080278 S: 85.82710932386138, RU: 3.2236857692521212, MI: 85.76654677058634 threshold: 0.02, WFmax: 0.20936435128001235
AVG IC 71.392
Fscore: 0.35870602376822336, Precision: 0.2245152491862296, Recall: 0.891618497243355 S: 61.54582884980445, RU: 3.636326134905697, MI: 61.438311997075765 threshold: 0.03, WFmax: 0.25366236702737743
AVG IC 58.152
Fscore: 0.3919933571911066, Precision: 0.2523790417115148, Recall: 0.8773214031637876 S: 48.685769151542786, RU: 3.9620801287104803, MI:

#### Domain + BLAST

In [30]:
# Run evaluate.py on the CC ontology for the domain + BLAST predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont cc --model domain_blast

Computing Fmax
AVG IC 11741.146
Fscore: 0.008581387728563953, Precision: 0.0043091832504145965, Recall: 1.0 S: 11727.5560872697, RU: 0.0, MI: 11727.5560872697 threshold: 0.0, WFmax: 0.0023122248383630516
AVG IC 139.801
Fscore: 0.27387121584155055, Precision: 0.16069111790896523, Recall: 0.9262834571546804 S: 128.83968789420643, RU: 2.602320862607024, MI: 128.81340420470443 threshold: 0.01, WFmax: 0.17745647971230755
AVG IC 91.780
Fscore: 0.342870070468673, Precision: 0.21129478401711932, Recall: 0.9087696197034945 S: 81.31584781869996, RU: 3.0676617993193793, MI: 81.25796304091661 threshold: 0.02, WFmax: 0.23997681366998314
AVG IC 70.562
Fscore: 0.38378689724618825, Precision: 0.244042932562108, Recall: 0.8980000645107366 S: 60.54230379750338, RU: 3.4703366153734665, MI: 60.44276063256203 threshold: 0.03, WFmax: 0.27845046461239653
AVG IC 59.518
Fscore: 0.4176385720569475, Precision: 0.27290879728533146, Recall: 0.8892035386510222 S: 49.723743360925866, RU: 3.6601926947566885, MI: 49.5

#### Domain + PPI

In [31]:
# Run evaluate.py on the CC ontology for the domain + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont cc --model domain_ppi

Computing Fmax
AVG IC 11741.146
Fscore: 0.008581387728563953, Precision: 0.0043091832504145965, Recall: 1.0 S: 11727.5560872697, RU: 0.0, MI: 11727.5560872697 threshold: 0.0, WFmax: 0.0023122248383630546
AVG IC 222.985
Fscore: 0.18280566759066974, Precision: 0.10097033769272605, Recall: 0.9646168000938674 S: 210.9808038715656, RU: 1.5797439122162513, MI: 210.97488952826552 threshold: 0.01, WFmax: 0.10868716175237698
AVG IC 129.875
Fscore: 0.25340848010275024, Precision: 0.14626016847250353, Recall: 0.947629951236309 S: 118.36816211131841, RU: 2.064600057762615, MI: 118.35015516767538 threshold: 0.02, WFmax: 0.16700638274483318
AVG IC 94.199
Fscore: 0.3053342936981997, Precision: 0.18257384093549062, Recall: 0.9319996153498806 S: 83.13224731231217, RU: 2.4854044322622872, MI: 83.09508594377604 threshold: 0.03, WFmax: 0.21151716251779154
AVG IC 73.435
Fscore: 0.34672531812276053, Precision: 0.21379503612493228, Recall: 0.9166929735168535 S: 62.85032966352301, RU: 2.936138302875986, MI: 6

#### Domain + BLAST + PPI

In [32]:
# Run evaluate.py on the CC ontology for the domain + BLAST + PPI predictions; it prints the metrics per score threshold
! python3 ../DomainPFP/evaluate.py --data-root ../data/netgo_benchmark --ont cc --model domain_blast_ppi

Computing Fmax
AVG IC 11741.146
Fscore: 0.008581387728563953, Precision: 0.0043091832504145965, Recall: 1.0 S: 11727.556087269732, RU: 0.0, MI: 11727.556087269732 threshold: 0.0, WFmax: 0.0023122248383630533
AVG IC 193.949
Fscore: 0.20530274664731432, Precision: 0.11487768930280717, Recall: 0.9645057915476323 S: 181.95041824804116, RU: 1.5847378412260051, MI: 181.94351680291248 threshold: 0.01, WFmax: 0.1283630643959075
AVG IC 114.731
Fscore: 0.2828939550897675, Precision: 0.16631968705171776, Recall: 0.9458325010072823 S: 103.17630604283342, RU: 2.015102358803309, MI: 103.15662601659658 threshold: 0.02, WFmax: 0.19411461148624354
AVG IC 84.651
Fscore: 0.3359490995880837, Precision: 0.20499719797860844, Recall: 0.9300877632478827 S: 73.60594098493105, RU: 2.5022459973238225, MI: 73.56339655865565 threshold: 0.03, WFmax: 0.24129197819789966
AVG IC 68.490
Fscore: 0.37662102004223935, Precision: 0.2367134771897743, Recall: 0.9209277493910724 S: 57.74931962594801, RU: 2.7824060425594968, M