In [3]:
from convertdata import *
import numpy as np
import pandas as pd

# Run baseline model for Yeast and E.coli

In [4]:
organism = "ecoli"
path = './data/'+organism+'/'
path

'./data/ecoli/'

In [5]:
linkfile = path + "edgelist_biogrid.txt"

edgelist = pd.read_csv(linkfile, sep=" ", header=None)

neg_adj = pd.read_csv(path + "neg_sample.txt", header = None, sep=",")
neg_adj.columns = ['A', 'B', 'C']

In [6]:
edgelist.shape

(148340, 3)

In [7]:
neg_adj.shape

(20196270, 3)

In [8]:
def convertdata(path, edgelist, neg_adj, train_file, test_file, validation_file, test_size=0.2):
    remove_file(test_file)
    remove_file(train_file)
    remove_file(validation_file)

    edgelist = edgelist.sample(frac=1).reset_index(drop=True)
    print("Splitting edgelist to train and test")
    x_train, x_test = train_test_split(edgelist, test_size=test_size)

    print("Sampling equal number of negative links")
    neg_adj_0 = neg_adj
    n = x_test.shape[0]
    ind = random.sample(range(len(neg_adj_0)), n)
    neg_adj_0 = pd.DataFrame(np.random.permutation(neg_adj_0))
    X_test_0 = neg_adj_0.iloc[ind, :]

    
    print("Creating validation and test set")
    X_test = pd.DataFrame(np.vstack((x_test, X_test_0))).astype(int)
    X_test, X_validation = train_test_split(X_test, test_size=0.5)

    print("Writing to file")
    X_test.to_csv(test_file, header=None, sep=' ', index=False, mode='a')
    X_validation.to_csv(validation_file, header=None, sep=' ', index=False, mode='a')
    x_train.to_csv(train_file, header=None, sep=' ', index=False, mode='a')

In [9]:
for i in range(2, 8):
    np.random.seed(i)
    test_size = float(i)/10
    print(test_size)
    print(path)
    print("converting data for iteration " + str(i) + " with random seed " + str(i))
    trainlinkfile = path + "data_ref/edgelist_train_"+str(test_size)+".txt"
    testlinkfile = path + "data_ref/edgelist_test_"+str(test_size)+".txt"
    vallinkfile = path + "data_ref/edgelist_val_"+str(test_size)+".txt"
    convertdata(path, edgelist, neg_adj, trainlinkfile, testlinkfile, vallinkfile, test_size=test_size)

0.2
./data/ecoli/
converting data for iteration 2 with random seed 2
Splitting edgelist to train and test
Sampling equal number of negative links
Creating validation and test set
Writing to file
0.3
./data/ecoli/
converting data for iteration 3 with random seed 3
Splitting edgelist to train and test
Sampling equal number of negative links
Creating validation and test set
Writing to file
0.4
./data/ecoli/
converting data for iteration 4 with random seed 4
Splitting edgelist to train and test
Sampling equal number of negative links
Creating validation and test set
Writing to file
0.5
./data/ecoli/
converting data for iteration 5 with random seed 5
Splitting edgelist to train and test
Sampling equal number of negative links
Creating validation and test set
Writing to file
0.6
./data/ecoli/
converting data for iteration 6 with random seed 6
Splitting edgelist to train and test
Sampling equal number of negative links
Creating validation and test set
Writing to file
0.7
./data/ecoli/
convert

In [3]:
import pickle
from evaluation import *
from numba import jit

@jit
def read_test_link(testlinkfile):
    X_test = []
    f = open(testlinkfile)
    line = f.readline()
    while line:
        line = line.strip().split(" ")
        X_test.append([int(line[0]), int(line[1]), int(line[2])])
        line = f.readline()
    f.close()
#     print("test link number:", len(X_test))
    return X_test


In [4]:
from joblib import Parallel, delayed
import multiprocessing
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
def evaluate_ROC(X_test, cosine_matrix):
    y_true = [ X_test[i][2] for i in range(len(X_test))]
    y_predict = [cosine_matrix[X_test[i][0], X_test[i][1]]  for i in range(len(X_test))]
    roc = roc_auc_score(y_true, y_predict)
    if roc < 0.5:
        roc = 1 - roc
    pr = average_precision_score(y_true, y_predict)
    return roc, pr

# LINE

In [10]:
for i in range(2, 8):
    np.random.seed(i)
    test_size = float(i)/10
    print("Evaluating for iteration " +str(i + 1))
    test_data = read_test_link(path + "data_ref/edgelist_test_0.7.txt")
    print(len(test_data))
    embeddings_file = "output/ecoli/line_embeddings/final_emb_0.2.txt"
    emb = load_embedding(embeddings_file, 4511, combineAttribute=False, datafile=path+"data_standard.txt")
    cosine_matrix = cosine_similarity(emb, emb)
    roc, pr = evaluate_ROC_from_cosine_matrix(test_data, cosine_matrix)
    print("LINE Method without attributes: Accuracy (ROC) in Test Data set", "{0:.9f} {1:.9f}".format(roc, pr))

    emb = load_embedding(embeddings_file, 4511, combineAttribute=True, datafile=path+"data_standard.txt")
    cosine_matrix = cosine_similarity(emb, emb)
    roc, pr = evaluate_ROC_from_cosine_matrix(test_data, cosine_matrix)
    print("LINE Method with Attributes: Accuracy (ROC) in Test Data set", "{0:.9f} {1:.9f}".format(roc, pr))

Evaluating for iteration 3
103838
LINE Method without attributes: Accuracy (ROC) in Test Data set 0.754263930 0.714346920
LINE Method with Attributes: Accuracy (ROC) in Test Data set 0.539857209 0.565137008
Evaluating for iteration 4
103838
LINE Method without attributes: Accuracy (ROC) in Test Data set 0.754333475 0.714368608
LINE Method with Attributes: Accuracy (ROC) in Test Data set 0.539874529 0.565077692
Evaluating for iteration 5
103838
LINE Method without attributes: Accuracy (ROC) in Test Data set 0.754105350 0.714290780
LINE Method with Attributes: Accuracy (ROC) in Test Data set 0.539844874 0.565093947
Evaluating for iteration 6
103838
LINE Method without attributes: Accuracy (ROC) in Test Data set 0.754356197 0.714381633
LINE Method with Attributes: Accuracy (ROC) in Test Data set 0.539911506 0.565163435
Evaluating for iteration 7
103838
LINE Method without attributes: Accuracy (ROC) in Test Data set 0.754278215 0.714355791
LINE Method with Attributes: Accuracy (ROC) in Tes

# Node2Vec

In [1]:
organism = "ecoli"
path = './data/'+organism+'/'

In [6]:
i = 0
print("Evaluating for iteration " +str(i + 1))
test_data = read_test_link(path + "data_ref/edgelist_test_"+str(i)+".txt")
len(test_data)

Evaluating for iteration 1


29668

In [12]:
embeddings_file = "output/ecoli/deepWalk_embeddings/final_emb_0.2.txt"
emb = load_embedding(embeddings_file, 4511, combineAttribute=False, datafile=path+"data_standard.txt")
emb.shape

(4511, 128)

In [13]:
cosine_matrix = cosine_similarity(emb, emb)
roc, pr = evaluate_ROC(test_data, cosine_matrix)
print("Node2vec Method without attributes: Accuracy (ROC) in Test Data set", "{0:.9f} {1:.9f}".format(roc, pr))

emb = load_embedding(embeddings_file, 4511, combineAttribute=True, datafile=path+"data_standard.txt")
cosine_matrix = cosine_similarity(emb, emb)
roc, pr = evaluate_ROC(test_data, cosine_matrix)
print("Node2vec Method with Attributes: Accuracy (ROC) in Test Data set", "{0:.9f} {1:.9f}".format(roc, pr))

Node2vec Method without attributes: Accuracy (ROC) in Test Data set 0.774772212 0.750232074
Node2vec Method with Attributes: Accuracy (ROC) in Test Data set 0.539318339 0.567778448


In [12]:
import networkx as nx
from node2vec import Node2Vec
from bokeh.models.graphs import from_networkx

In [4]:
# Create a graph
graph = nx.read_edgelist(path + "data_ref/edgelist_train_0.txt", nodetype=int, data=(('weight',float),))

In [17]:
from bokeh.plotting import figure
from bokeh.io import show, output_file

In [10]:
plot = figure(title="Networkx Integration Demonstration", x_range=(-1.1,1.1), y_range=(-1.1,1.1),
              tools="", toolbar_location=None)

In [13]:
G = from_networkx(graph, nx.spring_layout, scale=2, center=(0,0))

In [15]:
plot.renderers.append(G)

In [18]:
show(plot)