In [126]:
import sys
# The stdlib `imp` module was deprecated in Python 3.4 and removed in 3.12.
# `importlib.reload` is its supported replacement, so bind `importlib` to the
# same name: every `imp.reload(...)` call in the later cells keeps working
# unchanged on current interpreters.
import importlib as imp

# Put ../src at the front of the module search path; the `denoise` imports in
# the following cells are presumably resolved from there.
sys.path.insert(0, '../src')

In [127]:
from denoise.graph import io as graphio
imp.reload(graphio)  # pick up edits to the module without restarting the kernel

# Input files read by this cell.
go_labels_path = "../data/dream_files/human.golabels"
network_path = "../data/dream_files/dream_3.txt"

# Only the second value returned by the GO-label parser is kept; it is later
# looked up by node name (see the `labels` construction further down).
_, proteins_to_go = graphio.parse_go_label_file(go_labels_path)

# The graph parser returns the edge list together with the node list/map.
edgelist, node_list, node_map = graphio.parse_graph_file(network_path)

In [128]:
from denoise.graph import operations
from denoise.algorithms.dsd import computations

# Reload both project modules (same order as imported) so local edits apply.
for _module in (operations, computations):
    imp.reload(_module)

# Pipeline: edge list -> matrix A -> degree matrix D -> normalized matrix X.
# X is what later cells refer to as "the embedding".
A = operations.densify(edgelist)
D = computations.compute_degree_mat(A)
X = computations.compute_X_normalized(A, D)

In [130]:
import denoise.algorithms.dsd.denoise as denoise
imp.reload(denoise)

# Fraction of the ranked edges that is treated as "bad" and removed later.
# NOTE(review): the previous comment said 10 percent while the code used 0.5.
# The value used by the code is kept here; confirm the intended cut-off.
REMOVE_FRACTION = 0.5

# Predicted links over the embedding. Useful for adding edges as well
# (though more computationally expensive than ranking existing edges).
all_edges = denoise.predict_links(X)

# Rank the edges in the edgelist according to the embedding and take the
# bottom REMOVE_FRACTION of that ranking as the edges to remove.
ranked_edges = denoise.rank_edges(edgelist, X)
num_bad_edges = int(len(ranked_edges) * REMOVE_FRACTION)

# Slice from an explicit start index. With a negative-count slice, a count of
# zero turns into `ranked_edges[-0:]`, which selects *every* edge, not none.
bad_edges = ranked_edges[len(ranked_edges) - num_bad_edges:]

In [131]:
# Copy of A with the entries of the bottom-ranked edges zeroed out.
# NOTE(review): only the (u, v) entry is cleared. If A is symmetric and each
# edge is listed once, the (v, u) entry stays set — confirm against densify().
A_removed_edges = A.copy()
for (u, v, _, _) in bad_edges:
    A_removed_edges[u, v] = 0

# Copy of A with new edges added, taken in order from the front of all_edges,
# until the number added reaches 10% of the original edge count.
A_added_edges = A.copy()
target_added = len(edgelist) * 0.10  # loop-invariant, computed once
num_added = 0
for (u, v), _ in all_edges:
    if num_added >= target_added:
        break
    # Only pairs that are not already connected count as an added edge.
    if A_added_edges[u, v] == 0:
        A_added_edges[u, v] = 1
        num_added += 1
# Iterating (instead of indexing with an unbounded counter) means the loop
# simply stops if the candidate list runs out, rather than raising IndexError.

In [132]:
def _degree_and_embedding(adjacency):
    """Return (degree matrix, normalized X) computed from `adjacency`."""
    degree = computations.compute_degree_mat(adjacency)
    return degree, computations.compute_X_normalized(adjacency, degree)


# Same two-step computation as for the original graph, applied to each
# modified adjacency matrix.
D_added_edges, X_added_edges = _degree_and_embedding(A_added_edges)
D_removed_edges, X_removed_edges = _degree_and_embedding(A_removed_edges)

In [133]:
from denoise.algorithms.dsd import computations
imp.reload(computations)


def _rbf_similarity(embedding):
    """Similarity matrix of `embedding`, computed with params {"rbf": 1}."""
    # A fresh params dict is built on every call, as in the inline version.
    return computations.compute_sim_matrix(embedding, params={"rbf": 1})


# Similarity matrices for the original graph and the two modified graphs.
S = _rbf_similarity(X)
S_removed_edges = _rbf_similarity(X_removed_edges)
S_added_edges = _rbf_similarity(X_added_edges)

In [135]:
from denoise import scoring
from denoise import predict
imp.reload(predict)
imp.reload(scoring)

# n is the first dimension of A; indices 0..n-1 are used to index node_list.
n, _ = A.shape

# Map each index to the GO labels of its node, skipping nodes that have no
# entry in proteins_to_go.
labels = {}
for i in range(n):
    node_name = node_list[i]
    if node_name in proteins_to_go:
        labels[i] = proteins_to_go[node_name]

def create_predictor(similarity_matrix):
    """Build a WMV predictor bound to `similarity_matrix`.

    The returned callable takes a mapping of training labels keyed by index
    and forwards to `predict.wmv`, passing the matrix and a lookup function
    that yields the training labels for an index, or an empty list when the
    index is not in the mapping.
    """
    def predictor(training_labels):
        def tlabels_f(i):
            # Indices outside the training split have no labels.
            if i in training_labels:
                return training_labels[i]
            return []

        return predict.wmv(similarity_matrix, tlabels_f)

    return predictor
 
def _wmv_cv_scores(matrix):
    """5-fold cross-validation scores of the WMV predictor built on `matrix`."""
    return scoring.kfoldcv(5, labels, create_predictor(matrix))


# WMV directly on the adjacency matrices.
A_scores = _wmv_cv_scores(A)
A_added_edges_scores = _wmv_cv_scores(A_added_edges)
A_removed_edges_scores = _wmv_cv_scores(A_removed_edges)

# WMV on the similarity matrices derived from the embeddings.
S_scores = _wmv_cv_scores(S)
S_removed_edges_scores = _wmv_cv_scores(S_removed_edges)
S_added_edges_scores = _wmv_cv_scores(S_added_edges)

# Report: original-network variants first, then the embedded variants.
print(f"The scores for running 5-fold cv using WMV on the original network are: {A_scores}")
print(f"The scores for running 5-fold cv using WMV on the original network with edges removed are: {A_removed_edges_scores}")
print(f"The scores for running 5-fold cv using WMV on the original network with edges added are: {A_added_edges_scores}")

print(f"The scores for running 5-fold cv using WMV on the embedded network are: {S_scores}")
print(f"The scores for running 5-fold cv using WMV on the embedded network with edges removed are: {S_removed_edges_scores}")
print(f"The scores for running 5-fold cv using WMV on the embedded network with added edges are: {S_added_edges_scores}")

The scores for running 5-fold cv using WMV on the original network are: [0.13961407491486946, 0.1373439273552781, 0.11350737797956867, 0.1021566401816118, 0.09761634506242906]
The scores for running 5-fold cv using WMV on the original network with edges removed are: [0.12372304199772985, 0.11804767309875142, 0.094211123723042, 0.056753688989784334, 0.015891032917139614]
The scores for running 5-fold cv using WMV on the original network with edges added are: [0.13961407491486946, 0.1373439273552781, 0.11350737797956867, 0.1021566401816118, 0.09761634506242906]
The scores for running 5-fold cv using WMV on the embedded network are: [0.19182746878547105, 0.1532349602724177, 0.11464245175936436, 0.10556186152099886, 0.10102156640181612]
The scores for running 5-fold cv using WMV on the embedded network with edges removed are: [0.1872871736662883, 0.14869466515323496, 0.11237230419977298, 0.094211123723042, 0.06242905788876277]
The scores for running 5-fold cv using WMV on the embedded netw

In [137]:
# Imports for the kNN evaluation below. `scoring` and `predict` are imported
# and reloaded again here (an earlier cell already does this) so this cell
# uses their current on-disk versions; scipy's distance helpers are aliased
# as `spatial`.
from denoise import scoring
from denoise import predict
import scipy.spatial.distance as spatial
imp.reload(predict)
imp.reload(scoring)

def create_predictor(X, k=10):
    """Build a kNN predictor over the rows of `X`.

    NOTE: this rebinds the name `create_predictor`, which an earlier cell
    defines for the WMV variant. Re-running the earlier scoring cell after
    this one would pick up this kNN version instead.

    Parameters
    ----------
    X : array-like accepted by `scipy.spatial.distance.pdist`
        One observation per row.
    k : int, optional
        Neighbour count forwarded to `predict.knn` (default 10, the value
        that was previously hard-coded).

    Returns
    -------
    callable
        Takes a mapping of training labels keyed by index and returns the
        result of `predict.knn`.
    """
    # Square pairwise-distance matrix (pdist's default metric), computed once
    # here and reused on every call of the returned predictor.
    distances = spatial.squareform(spatial.pdist(X))

    def predictor(training_labels):
        def tlabels_f(i):
            # Indices outside the training split have no labels.
            return training_labels[i] if i in training_labels else []

        return predict.knn(distances, tlabels_f, k)

    return predictor

def _knn_cv_scores(embedding):
    """5-fold cross-validation scores of the predictor built on `embedding`."""
    return scoring.kfoldcv(5, labels, create_predictor(embedding))


# Evaluate on the embedding of the original graph and of both modified graphs.
X_scores = _knn_cv_scores(X)
X_added_edges_scores = _knn_cv_scores(X_added_edges)
X_removed_edges_scores = _knn_cv_scores(X_removed_edges)

In [138]:
# numpy is not imported by any earlier visible cell, so import it here;
# otherwise this cell raises NameError on a fresh kernel.
import numpy as np

# Mean cross-validation score of each configuration, as a percentage.
# Printed in this order: adjacency (original, added, removed), similarity
# (original, removed, added), embedding/kNN (original, removed, added).
for fold_scores in (
    A_scores,
    A_added_edges_scores,
    A_removed_edges_scores,
    S_scores,
    S_removed_edges_scores,
    S_added_edges_scores,
    X_scores,
    X_removed_edges_scores,
    X_added_edges_scores,
):
    print(np.mean(fold_scores) * 100)

11.804767309875142
11.804767309875142
8.172531214528945
13.325766174801362
12.099886492622018
13.325766174801362
12.553916004540294
6.855845629965948
12.57661748013621


In [124]:
# numpy is not imported by any earlier visible cell, so import it here.
import numpy as np

from denoise import predict
imp.reload(predict)

# Toy sanity check for predict.knn.
# The array gets its own name instead of `X`, so running this cell does not
# overwrite the embedding `X` that the analysis cells read.
# NOTE(review): the kNN scoring cell passes a square pairwise-distance matrix
# to predict.knn, whereas this check passes a 4x2 array — confirm which input
# predict.knn expects; the recorded output may come from an older version.
toy_points = np.array([[0, 5],
                       [0, 2],
                       [0, 2.5],
                       [0, 4]])


def labels_f(x):
    """Label lookup for the toy check: only index 2 carries a label."""
    return "A" if x == 2 else []


print(predict.knn(toy_points, labels_f, 1))

{0: '????', 1: 'A', 2: 'A', 3: '????'}
