In [1]:
import sys

# `imp` has been deprecated since Python 3.4 and was removed in 3.12.
# The cells below only call `imp.reload`, which `importlib` also provides,
# so fall back to `importlib` under the same name when `imp` is unavailable.
try:
    import imp
except ImportError:
    import importlib as imp

# Put the source tree first on the import path so the `denoise` imports in
# later cells resolve (path is relative to the notebook's working directory).
sys.path.insert(0, '../src')

In [66]:
import importlib

from denoise.graph import io as graphio

# Pick up on-disk edits to the parser module without restarting the kernel.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated
# and was removed in Python 3.12.
importlib.reload(graphio)

# GO label file: only the second returned value is kept. A later cell looks
# it up by node name (`proteins_to_go[node_list[i]]`).
_, proteins_to_go = graphio.parse_go_label_file("../data/dream_files/human.golabels")

# Network file: the edge list, a node list indexed by integer position in a
# later cell, and a node map (not used by the cells visible here).
edgelist, node_list, node_map = graphio.parse_graph_file("../data/dream_files/dream_3.txt")

In [57]:
# Build the matrices that the denoising and similarity cells work from.
# NOTE(review): this cell's recorded execution count (57) is lower than the
# data-loading cell's (66), so the saved results may have been computed from
# an older `edgelist` -- confirm with Restart Kernel -> Run All.
from denoise.graph import operations
from denoise.algorithms.dsd import computations

# Matrix form of the edge list; later cells read `A.shape` and index it as
# `A[u, v]` (presumably a dense adjacency matrix -- confirm in `densify`).
A = operations.densify(edgelist)
# Degree matrix derived from A.
D = computations.compute_degree_mat(A)
# Normalized embedding; used below to rank edges and to build similarities.
X = computations.compute_X_normalized(A, D)

In [69]:
import importlib

# NOTE: this alias rebinds the notebook-level name `denoise` to the DSD
# submodule. It is kept as-is because other cells may rely on the name.
import denoise.algorithms.dsd.denoise as denoise

# `importlib.reload` replaces `imp.reload` (`imp` was removed in Python 3.12).
importlib.reload(denoise)

# Fraction of the lowest-ranked edges to drop.
REMOVE_FRACTION = 0.10

# Rank the edges in the edgelist according to the embedding
# and remove bottom 10 percent of edges
ranked_edges = denoise.rank_edges(edgelist, X)

# Slice from an explicit start index. The earlier `ranked_edges[-k:]` form
# selects *every* edge when k truncates to 0 (fewer than 10 edges), because
# `-0 == 0` turns the slice into `ranked_edges[0:]`.
n_bad = int(len(ranked_edges) * REMOVE_FRACTION)
bad_edges = ranked_edges[len(ranked_edges) - n_bad:]

# Work on a copy so the original matrix stays available for comparison.
A_denoised = A.copy()
# NOTE(review): only the (u, v) entry is cleared. If A is symmetric and each
# undirected edge appears once in the ranking, (v, u) survives -- confirm.
for (u, v, _, _) in bad_edges:
    A_denoised[u, v] = 0

# Recompute the degree matrix and embedding for the pruned network.
D_denoised = computations.compute_degree_mat(A_denoised)
X_denoised = computations.compute_X_normalized(A_denoised, D_denoised)

In [76]:
import importlib

from denoise.algorithms.dsd import computations

# Reload so that edits to `computations` take effect in this kernel.
# `importlib.reload` replaces `imp.reload` (`imp` was removed in Python 3.12).
importlib.reload(computations)

# Similarity matrices for the original and the denoised embedding, built with
# identical parameters so the two are directly comparable.
# NOTE(review): `{"rbf": 2}` presumably selects an RBF kernel parameter --
# confirm against `compute_sim_matrix`.
S = computations.compute_sim_matrix(X, params={"rbf": 2})
S_denoised = computations.compute_sim_matrix(X_denoised, params={"rbf": 2})

In [81]:
import importlib

from denoise import scoring
from denoise import predict

# Reload so that edits to these modules take effect in this kernel.
# `importlib.reload` replaces `imp.reload` (`imp` was removed in Python 3.12).
importlib.reload(predict)
importlib.reload(scoring)

# Number of rows of A; the unpacking also requires A to be two-dimensional.
n, _ = A.shape

# Map each row index to the GO labels of the node at that position, skipping
# nodes that have no entry in `proteins_to_go`.
labels = {i: proteins_to_go[node_list[i]] for i in range(n)
          if node_list[i] in proteins_to_go}

def create_predictor(similarity_matrix):
    """Build a predictor closure bound to ``similarity_matrix``.

    The returned callable takes ``training_labels`` (a mapping supporting
    ``in`` and item lookup) and returns
    ``predict.wmv(similarity_matrix, lookup)``, where ``lookup(i)`` yields
    ``training_labels[i]`` when ``i`` is present and an empty list otherwise.
    """
    def predictor(training_labels):
        def labels_or_empty(i):
            # Entries absent from the training labels contribute no labels.
            if i in training_labels:
                return training_labels[i]
            return []

        return predict.wmv(similarity_matrix, labels_or_empty)

    return predictor
 
def _cv_scores(similarity_matrix):
    """Run 10-fold CV over ``labels`` with a WMV predictor on ``similarity_matrix``."""
    return scoring.kfoldcv(10, labels, create_predictor(similarity_matrix))


# Score the four network variants in the same order as they are reported.
A_scores = _cv_scores(A)
A_denoised_scores = _cv_scores(A_denoised)
S_scores = _cv_scores(S)
S_denoised_scores = _cv_scores(S_denoised)

# Emit one line per variant.
for report_line in (
    f"The scores for running 10-fold cv using WMV on the original network are: {A_scores}",
    f"The scores for running 10-fold cv using WMV on the denoised original network are: {A_denoised_scores}",
    f"The scores for running 10-fold cv using WMV on the embedded network are: {S_scores}",
    f"The scores for running 10-fold cv using WMV on the denoised embedded network are: {S_denoised_scores}",
):
    print(report_line)

The scores for running 10-fold cv using WMV on the original network are: [0.16818181818181818, 0.18181818181818182, 0.16363636363636364, 0.125, 0.12954545454545455, 0.10227272727272728, 0.1, 0.11136363636363636, 0.12272727272727273, 0.0898876404494382]
The scores for running 10-fold cv using WMV on the denoised original network are: [0.16818181818181818, 0.175, 0.16363636363636364, 0.12045454545454545, 0.11818181818181818, 0.1, 0.09772727272727273, 0.08863636363636364, 0.07727272727272727, 0.04943820224719101]
The scores for running 10-fold cv using WMV on the embedded network are: [0.20227272727272727, 0.17954545454545454, 0.15227272727272728, 0.15454545454545454, 0.1318181818181818, 0.09772727272727273, 0.12045454545454545, 0.075, 0.11136363636363636, 0.0853932584269663]
The scores for running 10-fold cv using WMV on the denoised embedded network are: [0.20227272727272727, 0.17954545454545454, 0.15227272727272728, 0.15454545454545454, 0.12954545454545455, 0.09545454545454546, 0.12272