In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy
from scipy.spatial.distance import pdist, squareform
import sklearn
import matplotlib.pyplot as plt
from collections import Counter
import tqdm
import requests
import time

In [None]:
def load_network_sparse(net_file,ngene):
    '''
    Read one edge-list file into a dense adjacency matrix.
    parameters:
    - net_file: str, tab-separated file without header; columns are [node1, node2, weight]
      and node ids are treated as 1-based (1 is subtracted before indexing)
    - ngene: int, side length of the returned square matrix
    output:
    - ngene x ngene float array; the function asserts it is symmetric, and every
      all-zero row receives a 1 on its diagonal (self-loop)
    '''
    edges = pd.read_csv(net_file, sep='\t', header=None)
    sources = edges.iloc[:, 0].values - 1
    targets = edges.iloc[:, 1].values - 1

    adjacency = np.zeros((ngene, ngene))
    adjacency[sources, targets] = edges.iloc[:, 2].values
    assert (adjacency == adjacency.T).all()

    # Rows without any non-zero entry get a self-loop so their row sum is not zero.
    isolated = np.flatnonzero(~adjacency.any(axis=1))
    adjacency[isolated, isolated] = 1
    return adjacency

In [None]:
def load_all_nets(ppi_files,n_gene):
    '''
    Load every network file and stack the adjacency matrices.
    parameters:
    - ppi_files: [str, str, ...], list of network file paths, each file should contain three columns: [protein1, protein2, score]
    - n_gene: int, number of genes (side length of every adjacency matrix)
    output:
    - nets: n_file x n_gene x n_gene array with ppi networks, in the order of ppi_files
    '''
    nets = np.zeros((len(ppi_files), n_gene, n_gene))
    for slot, path in enumerate(ppi_files):
        nets[slot, :, :] = load_network_sparse(path, n_gene)
    return nets

In [None]:
def compute_rwr_original_sparse(ppi_files,restart_prob,ngene,nets):
    '''
    Closed-form random walk with restart on each network.
    parameters:
    - ppi_files: list of network file paths (only its length is used here)
    - restart_prob: RWR restart probability
    - ngene: number of genes
    - nets: n_file x ngene x ngene array of adjacency matrices
    output:
    - walks: for the i-th RWR result, [i,:,:], each column is the stationary distribution of a node
    '''
    n_file = len(ppi_files)
    identity = np.eye(ngene)
    ones = np.ones(ngene)
    walks = np.zeros((n_file, ngene, ngene))
    for idx in range(n_file):
        adjacency = nets[idx, :, :]
        row_sums = adjacency @ ones
        # Broadcasting divides column j by row_sums[j]; for a symmetric adjacency
        # matrix this is the column-normalised transition matrix.
        transition = adjacency / row_sums
        resolvent = np.linalg.inv(identity - (1 - restart_prob) * transition)
        walks[idx, :, :] = resolvent * restart_prob
    return walks

In [None]:
def svd_embed_sparse_func(walks, ngene, embed_dim):
    '''
    Embed nodes from a stack of RWR matrices.
    Each matrix is log-transformed (entries <= 1e-8 are mapped to 0), the Gram
    matrices log(W).T @ log(W) are summed over networks, and the leading
    eigenpairs of that sum give the embedding.
    parameters:
    - walks: n_net x ngene x ngene array of RWR results (not modified)
    - ngene: int, number of genes; must match walks.shape[1]
    - embed_dim: int, number of eigenpairs to keep (scipy's eigs requires embed_dim < ngene - 1)
    output:
    - x: embed_dim x ngene real array, diag(eigenvalues ** 0.25) @ eigenvectors.T
    '''
    # Imported here so the function does not rely on `import scipy` exposing the submodule.
    from scipy.sparse.linalg import eigs

    n_net = walks.shape[0]
    mat = np.zeros((ngene, ngene))
    for i in range(n_net):
        W = walks[i, :, :]
        # With `where=` and no `out=`, NumPy leaves the masked-out entries
        # uninitialised. Supplying a zero-filled `out` makes them exactly 0 and
        # also avoids writing into the caller's `walks` array.
        log_W = np.log(W, out=np.zeros(W.shape, dtype=float), where=W > 1e-8)
        mat = mat + log_W.T @ log_W
    eigenvalues, eigenvectors = eigs(mat, k=embed_dim)
    x = np.diag(np.sqrt(np.sqrt(eigenvalues))) @ eigenvectors.T
    return np.real(x)

In [None]:
def load_train_test_anno(rand,fold,org,ont_type,ont_size1,ont_size2):
    '''
    Read the predefined train/test annotation matrices for one split.
    - rand: 1 2 3 4 5
    - fold: 1 2 3 4 5
    - org: "Ecoli" or "yeast"
    - ont_type, ont_size1, ont_size2: used verbatim to build the file name
    Returns (train, test) as NumPy arrays read from headerless tab-separated files.
    '''
    split_prefix = (
        'data/train_test_split/' + org + '/rand' + str(rand) + '/fold' + str(fold)
        + '_' + ont_type + '_' + str(ont_size1) + '_' + str(ont_size2)
    )
    train = pd.read_csv(split_prefix + '_train_anno.txt', header=None, sep='\t')
    test = pd.read_csv(split_prefix + '_test_anno.txt', header=None, sep='\t')
    return train.to_numpy(), test.to_numpy()

In [None]:
def augment_graph(nets, ngene, gene_clusters, mustlink_weight, cannotlink_weight):
    '''
    Append one extra node per cluster to every network.
    - nets: original adjacency matrices directly read from PPI files
    - ngene: number of genes (side length of each matrix in nets)
    - gene_clusters: (num_clusters, num_genes), binary matrix indicating which gene belongs to which clusters
    - mustlink_weight: weight of gene <-> cluster-node edges
    - cannotlink_weight: weight of cluster-node <-> cluster-node edges
    Returns an array of shape (n_nets, ngene + num_clusters, ngene + num_clusters).
    The diagonal is zeroed, after which any all-zero row gets a 1 on its diagonal.
    '''
    n_nets = nets.shape[0]
    n_clusters = gene_clusters.shape[0]
    size = ngene + n_clusters

    gene_cluster_links = mustlink_weight * gene_clusters
    cluster_cluster_links = cannotlink_weight * np.ones((n_clusters, n_clusters))

    augmented = np.zeros((n_nets, size, size))
    for layer in range(n_nets):
        block = np.block([
            [nets[layer, :, :], gene_cluster_links.T],
            [gene_cluster_links, cluster_cluster_links],
        ])
        np.fill_diagonal(block, 0)
        # Self-loop for rows that ended up without any edge.
        empty_rows = np.flatnonzero((np.absolute(block) == 0).all(axis=1))
        block[empty_rows, empty_rows] = 1
        augmented[layer, :, :] = block
    return augmented

In [None]:
def augmented_RWR(augmented_nets, restart_prob):
    '''
    RWR for augmented graph which contains negative edge weights.
    For every network A, with D = diag(row sums of |A|), the result is
    restart_prob * D @ inv(D - (1 - restart_prob) * A).
    - augmented_nets: n_nets x n_nodes x n_nodes array
    - restart_prob: restart probability
    Returns an array with the same shape as augmented_nets.
    '''
    n_nets, n_nodes = augmented_nets.shape[0], augmented_nets.shape[1]
    ones = np.ones(n_nodes)
    continue_prob = 1 - restart_prob
    augmented_walks = np.zeros((n_nets, n_nodes, n_nodes))
    for layer in range(n_nets):
        adjacency = augmented_nets[layer, :, :]
        # Degrees use absolute weights so negative edges still count towards the degree.
        degree = np.diag(np.absolute(adjacency) @ ones)
        system_inv = np.linalg.inv(degree - continue_prob * adjacency)
        augmented_walks[layer, :, :] = restart_prob * (degree @ system_inv)
    return augmented_walks


In [None]:
def augmented_SVD_with_cannolink(aug_walks, embed_dim):
    '''
    Embed nodes from RWR matrices that may contain negative entries.
    Each matrix is shifted so its minimum becomes 0 (no shift when the minimum
    is already positive), log-transformed (entries <= 1e-8 map to 0), and the
    Gram matrices log(W).T @ log(W) are summed over networks; the leading
    eigenpairs of that sum give the embedding.
    parameters:
    - aug_walks: n_net x n_node x n_node array of augmented RWR results (not modified)
    - embed_dim: int, number of eigenpairs to keep (scipy's eigs requires embed_dim < n_node - 1)
    output:
    - x: embed_dim x n_node real array, diag(eigenvalues ** 0.25) @ eigenvectors.T
    '''
    # Imported here so the function does not rely on `import scipy` exposing the submodule.
    from scipy.sparse.linalg import eigs

    n_net = aug_walks.shape[0]
    n_node = aug_walks.shape[1]
    mat = np.zeros((n_node, n_node))
    for i in range(n_net):
        W = aug_walks[i, :, :]
        min_entry = W.min()
        if min_entry > 0:
            min_entry = 0.0
        shifted = W - min_entry
        # With `where=` and no `out=`, NumPy leaves the masked-out entries
        # uninitialised; a zero-filled `out` makes them exactly 0.
        log_W = np.log(shifted, out=np.zeros(shifted.shape, dtype=float), where=shifted > 1e-8)
        mat = mat + log_W.T @ log_W
    eigenvalues, eigenvectors = eigs(mat, k=embed_dim)
    x = np.diag(np.sqrt(np.sqrt(eigenvalues))) @ eigenvectors.T
    return np.real(x)

In [None]:
def get_knn_ind(embed,train_anno):
    '''
    Pairwise distances between embedded nodes and the neighbour ranking derived from them.
    parameters:
    - embed: (dim, num_node), embeddings stored column-wise
    - train_anno: annotation matrix; only its second dimension (n_gene) is used, to
      keep the first n_gene nodes
    output:
    dist_mat: n_gene x n_gene distances (scipy pdist default metric), diagonal set to 1e8
    sorted_ind: n_gene x n_gene, each row lists node indices by increasing distance
    '''
    n_gene = train_anno.shape[1]
    all_pairs = squareform(pdist(embed.T))
    dist_mat = all_pairs[:n_gene, :n_gene]  # symmetrical
    # A large self-distance keeps a node from ranking as its own nearest neighbour.
    np.fill_diagonal(dist_mat, 1e8)
    return dist_mat, np.argsort(dist_mat, axis=1)


In [None]:
def majority_vote(dist_mat, knn_mat, train_anno, test_anno,k, weighted=True):
    '''
    k-nearest-neighbour label voting for every test gene.
    parameters:
    - dist_mat: n_gene x n_gene
    - knn_mat: n_gene x n_neighbor, each row lists neighbour indices from nearest to farthest
    - train_anno: n_label x n_gene
    - test_anno: n_label x n_gene
    - k: number of nearest neighbors
    - weighted: boolean, whether doing weighted majority vote or not
    output:
    - final_scores: n_label x n_test, normalized scores of each label
    - num_voters: list with the number of voting nodes found among the k neighbours
      (0 when a fallback voter had to be used)
    - updated_voters: list with the voter indices used for each test gene
    Training genes are the columns of train_anno with at least one label; test
    genes likewise for test_anno. The number of test genes that needed a random
    fallback voter is printed before returning.
    '''
    train_idx = np.where(np.sum(train_anno, axis=0) > 0)[0]
    test_idx = np.where(np.sum(test_anno, axis=0) > 0)[0]
    final_scores = np.zeros((train_anno.shape[0], len(test_idx)))
    num_voters = []
    updated_voters = []
    c = 0  # test genes whose labelled neighbours were all at distance zero
    for index, i in enumerate(test_idx):
        nn = knn_mat[i, :k]
        nn_labeled = nn[np.isin(nn, train_idx)]

        if len(nn_labeled) == 0:
            # No labelled gene among the first k neighbours: use the nearest labelled one.
            ranked = knn_mat[i, :]
            voting_node = ranked[np.isin(ranked, train_idx)][0]
            scores = np.array(train_anno[:, voting_node])
            num_voters.append(len(nn_labeled))
            updated_voters.append([voting_node])
        elif weighted:
            d = dist_mat[i, nn_labeled]
            # Build the mask once and apply it to both distances and neighbours so
            # every vote stays paired with its own weight. Zero distances are
            # dropped because the weight is 1 / distance.
            keep = d != 0
            voters = nn_labeled[keep]
            d = d[keep]
            updated_voters.append(voters)
            num_voters.append(len(d))
            if len(d) == 0:
                c += 1
                voting_node = np.random.choice(train_idx)
                scores = np.array(train_anno[:, voting_node])
            else:
                votes = np.array(train_anno[:, voters])
                scores = votes @ (1 / d).T
        else:
            votes = np.array(train_anno[:, nn_labeled])
            num_voters.append(len(nn_labeled))
            updated_voters.append(nn_labeled)
            scores = np.sum(votes, axis=1)

        scores = scores / np.sum(scores)
        final_scores[:, index] = np.squeeze(scores)
    print(c)
    return final_scores, num_voters,updated_voters


In [None]:
def acc_top1_pred(test_scores, test_anno):
    '''
    Top-1 accuracy: for each test gene the label with the highest score is the prediction,
    and it counts as correct when it is one of the gene's true labels.
    Ties go to the label with the smaller index as returned by argsort.
    Test genes whose scores sum to exactly 0 are excluded; their count is printed.
    parameters:
    - test_scores: n_label x n_test
    - test_anno: n_label x n_gene
    output:
    - acc: mean of true_pred
    - true_pred: vector with the true annotation value of the predicted label for each kept test gene
    '''
    test_genes = np.flatnonzero(test_anno.sum(axis=0) > 0)
    unscored = np.flatnonzero(np.sum(test_scores, axis=0) == 0)
    scored = np.ones(len(test_genes), dtype=bool)
    scored[unscored] = False

    truth = test_anno[:, test_genes]  # n_label x n_test
    # Row 0 of the descending ranking is the best-scoring label of each gene.
    top_label = np.argsort(-1 * test_scores, axis=0)[0, :]
    true_pred = truth[top_label, np.arange(truth.shape[1])][scored]
    print(len(unscored))
    return np.mean(true_pred), true_pred
    

In [None]:
def f1_auprc_pred(test_scores, test_anno,top_n):
    '''
    F1 and area under the precision-recall curve for top-n label predictions.
    For each test gene the top_n highest-scoring labels are marked as predicted (1),
    all others 0; ties follow argsort order. Both metrics are computed by scikit-learn
    on the flattened binary prediction matrix against the flattened true annotations.
    Test genes whose scores sum to exactly 0 are excluded.
    parameters:
    - test_scores: n_label x n_test
    - test_anno: n_label x n_gene
    - top_n: int, the number of labels to be predicted
    output:
    - f1: f1 score of the binary predictions
    - auprc: area under the precision-recall curve of the binary predictions
    '''
    test_genes = np.flatnonzero(test_anno.sum(axis=0) > 0)
    scored = np.ones(len(test_genes), dtype=bool)
    scored[np.flatnonzero(np.sum(test_scores, axis=0) == 0)] = False

    truth = test_anno[:, test_genes][:, scored]  # n_label x n_kept_test
    kept_scores = test_scores[:, scored]

    # Rows of `ranking` list labels from highest to lowest score for each gene.
    ranking = np.argsort(-1 * kept_scores, axis=0)
    label_rows = ranking[:top_n, :].flatten()
    gene_cols = np.tile(np.arange(truth.shape[1]), top_n)
    pred = np.zeros_like(truth)
    pred[label_rows, gene_cols] = 1

    y_true = truth.flatten()
    y_pred = pred.flatten()
    f1 = sklearn.metrics.f1_score(y_true, y_pred)
    precision, recall, _ = sklearn.metrics.precision_recall_curve(y_true, y_pred)
    auprc = sklearn.metrics.auc(recall, precision)
    return f1, auprc
    


In [None]:
def random_split_vector(train_anno,n_gene, num_sub_vectors,seed=None):
    '''
    for each augmented node, randomly choose a fixed number of nodes to connect to
    the number of genes that each augmented node connects to are the same except for the last one

    Genes with at least one training label are shuffled and cut into
    num_sub_vectors consecutive groups; the last group also takes the remainder.
    parameters:
    - train_anno: n_label x n_gene annotation matrix
    - n_gene: int, number of columns of the result
    - num_sub_vectors: int, number of groups (augmented nodes)
    - seed: optional int; when given, np.random.seed(seed) is called first
    output:
    - num_sub_vectors x n_gene int matrix, entry [g, j] is 1 when gene j is in group g
    raises:
    - ValueError when num_sub_vectors is not in 1..(number of labelled genes)
    '''
    if seed is not None:
        np.random.seed(seed)

    labeled_genes = np.where(sum(train_anno)>0)[0]
    n_labeled = len(labeled_genes)

    if not (0 < num_sub_vectors <= n_labeled):
        raise ValueError("Invalid number of sub-vectors")

    shuffled = np.random.permutation(labeled_genes)
    group_size = n_labeled // num_sub_vectors

    res_matrix = np.zeros((num_sub_vectors, n_gene), dtype=int)
    last_group = num_sub_vectors - 1
    for group in range(num_sub_vectors):
        lo = group * group_size
        hi = n_labeled if group == last_group else lo + group_size
        res_matrix[group, shuffled[lo:hi]] = 1

    return res_matrix

In [None]:
def run_pipeline(ppi_files,n_gene,method=None,restart_prob=None,embed_dim=None,rand=None,org=None,n_fold=None,k=None,ont_type=None,ont_size1=None,ont_size2=None,n_cluster=None):
    '''
    Evaluate one or more embedding methods over all folds of one split.
    parameters:
    - ppi_files: list of str, list of file paths to ppi networks
    - n_gene: int, number of genes
    - method: list of str, one or more of Mashup, REPEL (must be iterable; the None default is not usable)
    - restart_prob: float, RWR restart probability
    - embed_dim: int, number of dimension
    - rand: int, random split
    - org: str, "yeast" or "Ecoli"
    - n_fold: int, total number of folds
    - k: int, number of nearest neighbors to be considered
    - ont_type: str, bp or mf or cc
    - ont_size1: int, 11, 31, 101
    - ont_size2: int, 30, 100, 300
    - n_cluster: int, number of random augmented nodes (REPEL only)
    output:
    - performance_dict: a dictionary contains list of performances for all methods,
      keyed "<method>_acc", "<method>_f1", "<method>_auprc" with one value per fold.
      When an unknown method name is met, a message is printed and None is returned.
    '''

    performance_dict = {}

    for m in method:
        m_acc = m + "_acc"
        m_f1 = m + "_f1"
        m_auprc = m + "_auprc"
        performance_dict[m_acc] = []
        performance_dict[m_f1] = []
        performance_dict[m_auprc] = []

        # Fold-independent work is done lazily on the first fold and reused afterwards:
        # the networks depend only on ppi_files/n_gene, and the Mashup embedding depends
        # only on the networks, restart_prob and embed_dim.
        nets = None
        mashup_embed = None

        for i in range(n_fold):
            print("fold: ", i+1)
            train_anno, test_anno = load_train_test_anno(rand,i+1,org,ont_type,ont_size1,ont_size2)
            if m == "Mashup":
                print("Mashup")
                if mashup_embed is None:
                    nets = load_all_nets(ppi_files,n_gene)
                    walks = compute_rwr_original_sparse(ppi_files,restart_prob,n_gene,nets)
                    mashup_embed = svd_embed_sparse_func(walks, n_gene, embed_dim)
                embed = mashup_embed
            elif m == "REPEL":
                print("REPEL")
                if nets is None:
                    nets = load_all_nets(ppi_files,n_gene)
                # The augmented graph depends on this fold's training genes, so it is rebuilt per fold.
                rand_cluster = random_split_vector(train_anno,n_gene, n_cluster,seed=None)
                rand_graph = augment_graph(nets, n_gene, rand_cluster, 1, -1)
                rand_rwr_res = augmented_RWR(rand_graph, restart_prob)
                embed = augmented_SVD_with_cannolink(rand_rwr_res, embed_dim)
            else:
                print("Haven't implemented yet")
                return

            # Shared evaluation: weighted kNN vote, then top-1 accuracy and top-3 F1/AUPRC.
            dist_mat, knn = get_knn_ind(embed,train_anno)
            scores, _, _ = majority_vote(dist_mat, knn, train_anno, test_anno,k, weighted=True)
            acc,_ = acc_top1_pred(scores, test_anno)
            f1, auprc = f1_auprc_pred(scores, test_anno,3)
            performance_dict[m_acc].append(acc)
            performance_dict[m_f1].append(f1)
            performance_dict[m_auprc].append(auprc)
    return performance_dict


In [None]:
def write_log(performance_dict,rand,org,ont_type,ont_size1,ont_size2,save_path=None):
    '''
    Append one result record to a text log.
    The record is a header line "<org> rand <rand> <ont_type> <ont_size1> <ont_size2>"
    followed by one line per dictionary entry: the key, then every value formatted
    with four decimals, each followed by a space.
    - performance_dict: dict mapping metric name -> list of numbers
    - save_path: path of the log file, opened in append mode
    '''
    with open(save_path,"a") as log:
        header = org + " " + "rand " + str(rand) + " " + ont_type + " " + str(ont_size1) + " " + str(ont_size2)
        log.write(header + "\n")
        for metric, values in performance_dict.items():
            formatted = "".join(f"{value:.4f}" + " " for value in values)
            log.write(metric + " " + formatted + "\n")

In [None]:
# Yeast setup: one adjacency file per STRING evidence channel, plus the reference gene list.
string_nets = ['neighborhood', 'fusion', 'cooccurence', 'coexpression', 'experimental', 'database']
ppi_files = ['data/networks/yeast/yeast_string_'+net+'_adjacency.txt' for net in string_nets]

# The gene list has no header; the first column is taken and only its length is used below.
all_genes = list(
    pd.read_csv("data/annotations/yeast/go_yeast_ref_genes.txt",header=None).iloc[:,0].values
)
n_gene = len(all_genes)
print(ppi_files)
print(n_gene)

In [None]:
# Yeast evaluation of REPEL and Mashup; results are appended to yeast_result_log.txt.
method=['REPEL','Mashup']
restart_prob=0.5
embed_dim=400
org="yeast"
n_fold=5
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
n_cluster=15
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            performance_dict = run_pipeline(ppi_files,n_gene,method=method,restart_prob=restart_prob,embed_dim=embed_dim,rand=rand,org=org,n_fold=n_fold,k=k,ont_type=ont_type,ont_size1=ont_size1,ont_size2=ont_size2,n_cluster=n_cluster)
            write_log(performance_dict,rand,org,ont_type,ont_size1,ont_size2,save_path="yeast_result_log.txt")
            # The three breaks stop after the first combination (rand 1, "bp", 11-30).
            # NOTE(review): looks like a leftover quick-run limiter — confirm before a full run.
            break
        break
    break


For E. coli

In [None]:
# NOTE(review): this cell follows the "For E. coli" heading but sets org="yeast" and appends
# to yeast_result_log.txt — confirm which organism it is meant to evaluate.
# run_pipeline only recognises the exact names "Mashup" and "REPEL"; any other spelling
# makes it return None, which write_log cannot handle.
method=['Mashup','REPEL']
restart_prob=0.5
embed_dim=400
org="yeast"
n_fold=5
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
n_cluster=20
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            performance_dict = run_pipeline(ppi_files,n_gene,method=method,restart_prob=restart_prob,embed_dim=embed_dim,rand=rand,org=org,n_fold=n_fold,k=k,ont_type=ont_type,ont_size1=ont_size1,ont_size2=ont_size2,n_cluster=n_cluster)
            write_log(performance_dict,rand,org,ont_type,ont_size1,ont_size2,save_path="yeast_result_log.txt")



In [None]:
# for E.coli
string_nets = ['neighborhood', 'fusion', 'cooccurence', 'coexpression', 'experimental', 'database']
ppi_files = []
for net in string_nets:
    tmp = 'data/networks/Ecoli/ecoli_string_'+net+'_adjacency.txt'
    ppi_files.append(tmp)

all_genes = pd.read_csv("data/annotations/Ecoli/ecoli_ref_genes.txt",header=None,sep='\t')
all_genes = list(all_genes.iloc[:,1].values)
n_gene = len(all_genes)
print(ppi_files)
print(n_gene)


In [None]:
# E. coli evaluation of Mashup and REPEL; results are appended to Ecoli_result_log_rerun.txt.
method=['Mashup','REPEL']
restart_prob=0.5
embed_dim=400
org="Ecoli"
n_fold=5
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
n_cluster=10
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            performance_dict = run_pipeline(ppi_files,n_gene,method=method,restart_prob=restart_prob,embed_dim=embed_dim,rand=rand,org=org,n_fold=n_fold,k=k,ont_type=ont_type,ont_size1=ont_size1,ont_size2=ont_size2,n_cluster=n_cluster)
            write_log(performance_dict,rand,org,ont_type,ont_size1,ont_size2,save_path="Ecoli_result_log_rerun.txt")



In [None]:
# Load the BIONIC yeast feature table: tab-separated, first row as column header,
# first column as the row index.
bionic_embedding_yeast = pd.read_csv("bionic/outputs/yeast_mu_features.tsv",sep="\t",header=0,index_col=0)

In [None]:
# Display the loaded BIONIC yeast feature table.
bionic_embedding_yeast

In [None]:
# Feature vector of the row labelled 3 (label-based lookup via .loc, not a position).
bionic_embedding_yeast.loc[3].to_numpy()

In [None]:
# Build a 512-dimensional embedding (RWR restart probability 0.5) from the networks
# currently bound to the notebook-level `ppi_files` / `n_gene`.
# NOTE(review): read top to bottom, those globals were last assigned by the E. coli setup
# cell, while `mu_x` is later passed to the yeast BIONIC comparison — confirm the intended run order.
nets = load_all_nets(ppi_files,n_gene)
walks = compute_rwr_original_sparse(ppi_files,0.5,n_gene,nets)
mu_x = svd_embed_sparse_func(walks, n_gene, 512)

In [None]:
# Shape check of the embedding; svd_embed_sparse_func returns (embed_dim, n_gene).
mu_x.shape

In [None]:
def compare_w_bionic(bionic_embed_df,mu_x,rand,org,ont_type,ont_size1,ont_size2,k,n_fold=5):
    '''
    Evaluate BIONIC features with the same kNN protocol used for the other methods.
    Rows of bionic_embed_df replace the corresponding rows of mu_x.T; genes absent
    from bionic_embed_df keep their mu_x embedding.
    parameters:
    - bionic_embed_df: DataFrame of BIONIC features whose index holds 1-based gene ids
      (1 is subtracted to get row positions); assumes unique index labels and as many
      columns as mu_x has rows — TODO confirm against the feature files
    - mu_x: (dim, n_gene) embedding used as the fallback; it is NOT modified
    - rand, org, ont_type, ont_size1, ont_size2: select the annotation split files
    - k: int, number of nearest neighbors
    - n_fold: int, number of folds to evaluate (default 5)
    output:
    - perf: dict with lists "bionic_acc", "bionic_f1", "bionic_auprc", one value per fold
    '''
    bionix_id = np.array(bionic_embed_df.index-1)
    # Copy first: mu_x.T is a view, so writing into it would overwrite the caller's mu_x.
    merge_embed = mu_x.T.copy()
    merge_embed[bionix_id, :] = bionic_embed_df.to_numpy()

    perf = {}
    perf["bionic_acc"] = []
    perf["bionic_f1"] = []
    perf["bionic_auprc"] = []

    for i in range(n_fold):
        train_anno, test_anno = load_train_test_anno(rand,i+1,org,ont_type,ont_size1,ont_size2)
        dist_mat, knn = get_knn_ind(merge_embed.T,train_anno)
        scores, _, _ = majority_vote(dist_mat, knn, train_anno, test_anno,k)
        acc,_ = acc_top1_pred(scores, test_anno)
        f1, auprc = f1_auprc_pred(scores, test_anno,3)
        perf["bionic_acc"].append(acc)
        perf["bionic_f1"].append(f1)
        perf["bionic_auprc"].append(auprc)
    return perf


In [None]:
# BIONIC evaluation on yeast over all splits; results are appended to Bionic_yeast.txt.
org="yeast"
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            perf = compare_w_bionic(bionic_embedding_yeast,mu_x,rand,org,ont_type,ont_size1,ont_size2,k)
            write_log(perf,rand,org,ont_type,ont_size1,ont_size2,save_path="Bionic_yeast.txt")


In [None]:
# Recompute the 512-dimensional embedding (restart probability 0.5) from whatever
# `ppi_files` / `n_gene` are currently bound to, then show its shape.
# Same three calls as the earlier embedding cell; `nets`, `walks` and `mu_x` are rebound.
nets = load_all_nets(ppi_files,n_gene)
walks = compute_rwr_original_sparse(ppi_files,0.5,n_gene,nets)
mu_x = svd_embed_sparse_func(walks, n_gene, 512)
mu_x.shape

In [None]:
# Load the BIONIC E. coli feature table (tab-separated, first row as header,
# first column as the row index) and display it.
bionic_embedding_ecoli = pd.read_csv("bionic/outputs/ecoli_mu_features.tsv",sep="\t",header=0,index_col=0)
bionic_embedding_ecoli

In [None]:
# BIONIC evaluation on E. coli over all splits; results are appended to Bionic_ecoli_rerun.txt.
org="Ecoli"
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            perf = compare_w_bionic(bionic_embedding_ecoli,mu_x,rand,org,ont_type,ont_size1,ont_size2,k)
            write_log(perf,rand,org,ont_type,ont_size1,ont_size2,save_path="Bionic_ecoli_rerun.txt")

In [None]:
# Load the precomputed deepNF yeast features from a .npy file and show the array shape.
# NOTE(review): the evaluation below passes deepNF_features.T as a (dim, n_gene) embedding,
# so this array is presumably (n_gene, dim) — confirm from the displayed shape.
deepNF_features = np.load('deepNF/ori_yeast_midmodel_features_dim600.npy') 
deepNF_features.shape

In [None]:
# Single-split run of the kNN evaluation on the deepNF yeast features:
# rand 1, fold 1, ontology "bp" with sizes 11-30, k = 10, top-3 labels for F1/AUPRC.
# acc, f1 and auprc are assigned but not displayed by this cell.
train_anno, test_anno = load_train_test_anno(1,1,"yeast","bp",11,30)
dist_mat, knn = get_knn_ind(deepNF_features.T,train_anno)
scores, _, _ = majority_vote(dist_mat, knn, train_anno, test_anno,10)
acc,_ = acc_top1_pred(scores, test_anno)
f1, auprc = f1_auprc_pred(scores, test_anno,3)

In [None]:
def compare_w_deepNF(deepNF_embed,rand,org,ont_type,ont_size1,ont_size2,k):
    '''
    Evaluate deepNF features with the kNN protocol over folds 1..5 of one split.
    parameters:
    - deepNF_embed: feature array; its transpose is passed on as the (dim, n_gene) embedding
    - rand, org, ont_type, ont_size1, ont_size2: select the annotation split files
    - k: int, number of nearest neighbors
    output:
    - dict with lists "deepNF_acc", "deepNF_f1", "deepNF_auprc", one value per fold
    '''
    accs, f1s, auprcs = [], [], []
    for fold in range(1, 6):
        train_anno, test_anno = load_train_test_anno(rand,fold,org,ont_type,ont_size1,ont_size2)
        dist_mat, knn = get_knn_ind(deepNF_embed.T,train_anno)
        scores, _, _ = majority_vote(dist_mat, knn, train_anno, test_anno,k)
        acc,_ = acc_top1_pred(scores, test_anno)
        f1, auprc = f1_auprc_pred(scores, test_anno,3)
        accs.append(acc)
        f1s.append(f1)
        auprcs.append(auprc)
    return {"deepNF_acc": accs, "deepNF_f1": f1s, "deepNF_auprc": auprcs}

In [None]:
# deepNF evaluation on yeast over all splits; results are appended to DeepNF_yeast.txt.
org="yeast"
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            perf = compare_w_deepNF(deepNF_features,rand,org,ont_type,ont_size1,ont_size2,k)
            write_log(perf,rand,org,ont_type,ont_size1,ont_size2,save_path="DeepNF_yeast.txt")

In [None]:
# Load the precomputed deepNF E. coli features from a .npy file and show the array shape.
deepNF_features_ecoli = np.load('deepNF/ecoli_midmodel_features_dim600.npy') 
deepNF_features_ecoli.shape

In [None]:
# deepNF evaluation on E. coli over all splits; results are appended to DeepNF_ecoli_rerun.txt.
org="Ecoli"
k=10
ont_type_list=["bp","mf","cc"]
# Paired ontology size ranges: (11, 30), (31, 100), (101, 300).
ont_size1_list=[11,31,101]
ont_size2_list=[30,100,300]
for rand in range(1,6):
    print("rand: ",rand)
    for ont_type in ont_type_list:
        # Print the value being processed, not the literal variable name.
        print("ont_type:", ont_type)
        for ont_size1, ont_size2 in zip(ont_size1_list, ont_size2_list):
            perf = compare_w_deepNF(deepNF_features_ecoli,rand,org,ont_type,ont_size1,ont_size2,k)
            write_log(perf,rand,org,ont_type,ont_size1,ont_size2,save_path="DeepNF_ecoli_rerun.txt")

In [None]:
# Load one E. coli split for inspection: rand 1, fold 1, ontology "cc", sizes 11-30.
train_anno, test_anno = load_train_test_anno(1,1,"Ecoli","cc",11,30)

In [None]:
# Display the training annotation array loaded above.
train_anno

In [None]:
# Display the test annotation array loaded above.
test_anno