In [35]:
import pickle 
import LoadData as data
import numpy as np
from GNE import GNE
from evaluation import *
from utils import *
import pandas as pd
import scipy.sparse as sp
import random

In [None]:
def split_seq(seq, num_splits):
    """Cut ``seq`` into ``num_splits`` consecutive slices of near-equal length.

    The ideal (fractional) chunk length is ``len(seq) / num_splits``; each
    boundary is that length times the chunk number, rounded to the nearest
    integer, so chunk sizes differ by at most one element.

    Args:
        seq: Any sliceable sequence (list, string, range, ...).
        num_splits: Number of chunks to produce.

    Returns:
        A list of ``num_splits`` slices of ``seq``, in order.

    Raises:
        ZeroDivisionError: If ``num_splits`` is 0.
    """
    chunk_len = 1.0/num_splits*len(seq)
    # Rounded cut points 0 .. len(seq); neighbouring pairs delimit one chunk.
    bounds = [int(round(k*chunk_len)) for k in range(num_splits + 1)]
    return [seq[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

In [43]:
path = './data/ecoli/'

In [44]:

# Read the gene-id table; its row count is used as the number of genes.
geneids = pd.read_csv(path + "gene_ids.tsv", sep=" ")
num_genes = geneids.shape[0]
link_file = path + "edgelist_biogrid.txt"

# Build the adjacency structure from the edge-list file.
# NOTE(review): load_network is not defined in this notebook (presumably it comes
# from `from utils import *`) — confirm its return type before relying on `adj`.
adj = load_network(link_file, num_genes)

### Loading [./data/ecoli/edgelist_biogrid.txt]...


In [45]:
np.sum(adj)

148340.0

In [None]:
def create_dataset_for_comparison(path, adj, seed=None):
    """Build equally sized sets of linked and non-linked node pairs from ``adj``.

    The adjacency is round-tripped through a graph, its diagonal is removed,
    and it is flattened to a pair table via ``convertAdjMatrixToSortedRankTSV``.
    Each pair is normalised to ``(min_id, max_id)`` and de-duplicated. As many
    label-0 pairs are then drawn at random as there are label-1 pairs.

    NOTE(review): ``nx`` and ``convertAdjMatrixToSortedRankTSV`` are not defined
    in the visible notebook cells (presumably provided by ``from utils import *``).
    The helper is assumed to return a frame whose first two columns are node ids
    and whose third column is the 0/1 link indicator — confirm.
    NOTE(review): ``nx.to_scipy_sparse_matrix`` was removed in networkx 3.0;
    confirm the pinned networkx version.

    Args:
        path: Unused; kept so existing callers keep working.
        adj: Adjacency input accepted by ``nx.Graph``.
        seed: Optional integer. When given, both sampling steps use private
            generators seeded with it, so the result is reproducible. When
            ``None`` (default) the global ``random`` / ``np.random`` state is
            used, exactly as before.

    Returns:
        ``(pos_edges, neg_edges)``: ``pos_edges`` is a DataFrame with columns
        ``i, j, k`` holding every label-1 pair. ``neg_edges`` is a DataFrame
        (default integer column labels) with the same number of rows, sampled
        without replacement from the label-0 pairs.

    Raises:
        ValueError: If there are fewer label-0 pairs than label-1 pairs.
    """
    print("Creating 2 split of data")
    g = nx.Graph(adj)
    adj = nx.to_scipy_sparse_matrix(g)
    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    # Flatten to (id, id, label) rows and order each pair as (min, max) so that
    # (a, b) and (b, a) collapse into one row when duplicates are dropped.
    edgelist = convertAdjMatrixToSortedRankTSV(adj.todense())
    pair_ids = np.array(edgelist.iloc[:, :2])
    data_df = pd.DataFrame(
        {
            'i': pair_ids.min(axis=1).astype(int),
            'j': pair_ids.max(axis=1).astype(int),
            'k': np.array(edgelist.iloc[:, 2]),
        },
        columns=['i', 'j', 'k'],
    )
    data_df = data_df.drop_duplicates()

    pos_edges = data_df.loc[data_df['k'] == 1]
    neg_edgelist = data_df.loc[data_df['k'] == 0]

    n_pos = pos_edges.shape[0]
    n_neg_available = len(neg_edgelist)
    if n_pos > n_neg_available:
        raise ValueError(
            "Cannot draw %d negative pairs: only %d non-linked pairs available"
            % (n_pos, n_neg_available)
        )

    # Use private generators only when a seed is requested; otherwise consume
    # the global generators in the same order as the original implementation.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    ind = py_rng.sample(range(n_neg_available), n_pos)
    neg_edges = pd.DataFrame(np_rng.permutation(neg_edgelist.values))
    neg_edges = neg_edges.iloc[ind, :]

    # NOTE: an earlier (disabled) disjointness check iterated the DataFrames
    # directly, which yields column labels rather than rows. If it is
    # reinstated, compare the (i, j) rows of `.values` instead.
    return pos_edges, neg_edges

In [None]:
X_pos, X_neg = create_dataset_for_comparison(path, adj)

In [None]:
X_pos, X_neg = X_pos.values, X_neg.values

In [None]:
X_pos, X_neg = X_pos.astype(int), X_neg.astype(int)

In [None]:
randomized_indices = np.random.permutation(range(len(X_pos)))

In [None]:
index_splits = split_seq(range(len(X_pos)), 2)

In [None]:
test_index = randomized_indices[index_splits[1]]

In [None]:
train_index = randomized_indices[index_splits[0]]

In [None]:
# Materialise the positive / negative pair arrays for each split by row index.
train_edges = X_pos[train_index,:]
train_edges_false = X_neg[train_index,:]
# NOTE(review): the validation arrays are indexed with train_index, so they are
# identical to the training arrays — confirm this is intended for the 2-way split.
val_edges = X_pos[train_index,:]
val_edges_false = X_neg[train_index,:]
test_edges = X_pos[test_index,:]
test_edges_false = X_neg[test_index,:]

In [None]:
pd.DataFrame(train_edges).to_csv("/Users/kk3671/Documents/OpenNE/data/yeast/train_links_0.5_split_1.txt", index=False, header=False, sep=' ')

In [None]:
pd.DataFrame(test_edges).to_csv("/Users/kk3671/Documents/OpenNE/data/yeast/train_links_0.5_split_2.txt", index=False, header=False, sep=' ')

In [None]:
train_data = np.concatenate([train_edges, train_edges_false])
test_data =  np.concatenate([test_edges, test_edges_false])

In [None]:
# Persist both halves (positives followed by negatives) as space-separated text
# without header or index.
# NOTE(review): these files are written under .../data/yeast/ although `path`
# points at ./data/ecoli/ and the next cell reads .../data/ecoli/train_data_*.txt
# — confirm the intended organism directory. The absolute, user-specific paths
# also make the notebook non-portable.
pd.DataFrame(train_data).to_csv("/Users/kk3671/Documents/OpenNE/data/yeast/train_data_1.txt", index=False, header=False, sep=' ')
pd.DataFrame(test_data).to_csv("/Users/kk3671/Documents/OpenNE/data/yeast/train_data_2.txt", index=False, header=False, sep=' ')

In [46]:
train_data = pd.read_csv("/Users/kk3671/Documents/OpenNE/data/ecoli/train_data_1.txt", header=None, sep=' ')
test_data = pd.read_csv("/Users/kk3671/Documents/OpenNE/data/ecoli/train_data_2.txt",  header=None, sep=' ')

In [7]:
# Split the reloaded tables on their third column: 1 = link, 0 = non-link.
_train_flag = train_data.iloc[:, 2]
_test_flag = test_data.iloc[:, 2]

train_edges = train_data.loc[_train_flag == 1].values
train_edges_false = train_data.loc[_train_flag == 0].values
# The validation arrays are selected from train_data with the same masks as above.
val_edges = train_data.loc[_train_flag == 1].values
val_edges_false = train_data.loc[_train_flag == 0].values
test_edges = test_data.loc[_test_flag == 1].values
test_edges_false = test_data.loc[_test_flag == 0].values

In [8]:
train_edges.shape

(73738, 3)

In [9]:
a1_rows = set(map(tuple, train_edges))

In [10]:
a2_rows = set(map(tuple, test_edges))

In [11]:
a1_rows.isdisjoint(a2_rows)

True

In [12]:
feature_file = path + 'expression_data.tsv'

In [6]:
# Load the pre-computed split dictionary. The context manager closes the file
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code — only load split files that were
# produced by a trusted source.
with open(path + "split_data_" + str(0.9) + ".pkl", 'rb') as test_split_file:
    dataset = pickle.load(test_split_file)

In [7]:
# Pull the six edge collections out of the split dictionary in one step.
(train_edges, train_edges_false,
 val_edges, val_edges_false,
 test_edges, test_edges_false) = (
    dataset['train_pos'], dataset['train_neg'],
    dataset['val_pos'], dataset['val_neg'],
    dataset['test_pos'], dataset['test_neg'],
)

In [10]:
# Inspect train/test split
# Prints the node count (rows of adj), the sum of all adjacency entries, and the
# number of positive / negative pairs in each split.
print("Total nodes:", adj.shape[0])
# np.sum(adj) adds up every entry of adj; if adj is a symmetric 0/1 matrix each
# undirected edge is therefore counted twice — TODO confirm against load_network.
print("Total edges:", np.sum(adj))  # adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
print("Test edges (positive):", len(test_edges))
print("Test edges (negative):", len(test_edges_false))

Total nodes: 5950
Total edges: 544652.0
Training edges (positive): 395875
Training edges (negative): 395875
Validation edges (positive): 43987
Validation edges (negative): 43987
Test edges (positive): 48874
Test edges (negative): 48874


In [11]:
validation_edges =  np.concatenate([val_edges, val_edges_false])
val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

In [18]:
Data = data.LoadData(path, train_links=train_edges, features_file=feature_file)

Constructing Nodes
attr_M: 536
id_N: 5950
Reading training links
Constructing Neighborhood maps
Constructing train data


In [19]:
len(train_edges)

395875

In [20]:
len(test_edges)

48874

In [21]:
# Settings handed to the GNE constructor, declared in one place.
parameters = {
    'id_embedding_size': 128,
    'attr_embedding_size': 128,
    'batch_size': 128,
    'alpha': 1,
    'n_neg_samples': 10,
    'epoch': 20,
    'representation_size': 128,
    'learning_rate': 0.002,
}
parameters

{'alpha': 1,
 'attr_embedding_size': 128,
 'batch_size': 128,
 'epoch': 20,
 'id_embedding_size': 128,
 'learning_rate': 0.002,
 'n_neg_samples': 10,
 'representation_size': 128}

In [22]:
model = GNE(path, Data, 2018, parameters)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

{'id_embedding_size': 128, 'attr_embedding_size': 128, 'batch_size': 128, 'alpha': 1, 'n_neg_samples': 10, 'epoch': 20, 'representation_size': 128, 'learning_rate': 0.002}


In [23]:
embeddings = model.train(validation_edges, val_edge_labels)

Using structure and attribute embedding
Epoch:      1, Train-Batch Loss: 7.664445485, Validation AUC: 0.613114838 *
Epoch:      2, Train-Batch Loss: 5.851905940, Validation AUC: 0.672961554 *
Epoch:      3, Train-Batch Loss: 4.208199690, Validation AUC: 0.682378791 *
Epoch:      4, Train-Batch Loss: 2.759073359, Validation AUC: 0.721388262 *
Epoch:      5, Train-Batch Loss: 1.735395315, Validation AUC: 0.763078638 *
Epoch:      6, Train-Batch Loss: 1.383411345, Validation AUC: 0.770957263 *
Epoch:      7, Train-Batch Loss: 1.296350274, Validation AUC: 0.774573288 *
Epoch:      8, Train-Batch Loss: 1.237961712, Validation AUC: 0.779877368 *
Epoch:      9, Train-Batch Loss: 1.212167763, Validation AUC: 0.787016381 *
Epoch:     10, Train-Batch Loss: 1.191597908, Validation AUC: 0.790011660 *
Epoch:     11, Train-Batch Loss: 1.168718977, Validation AUC: 0.793179585 *
Epoch:     12, Train-Batch Loss: 1.165197685, Validation AUC: 0.795553109 *
Epoch:     13, Train-Batch Loss: 1.136695454, Va

In [24]:
import pandas as pd
pd.DataFrame(embeddings).to_csv("embeddings_ecoli.txt", header=False, index=False, sep=",")

In [2]:
embeddings = pd.read_csv("embeddings_ecoli.txt", header=None, sep=",").values

In [12]:
train_edges

array([[9.940e+02, 4.797e+03, 1.000e+00],
       [5.336e+03, 5.768e+03, 1.000e+00],
       [7.350e+02, 5.308e+03, 1.000e+00],
       ...,
       [4.210e+02, 1.777e+03, 1.000e+00],
       [3.758e+03, 4.693e+03, 1.000e+00],
       [1.625e+03, 3.283e+03, 1.000e+00]])

In [13]:
embeddings.shape

(5950, 128)

In [14]:
# Train-set edge embeddings
pos_train_edge_embs = get_edge_embeddings(embeddings, train_edges)
neg_train_edge_embs = get_edge_embeddings(embeddings, train_edges_false)
train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
# Create train-set edge labels: 1 = real edge, 0 = false edge
train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

In [15]:
# Test-set edge embeddings, labels
# Positive pairs first, then negative pairs; the label vector below follows the
# same order.
pos_test_edge_embs = get_edge_embeddings(embeddings, test_edges)
neg_test_edge_embs = get_edge_embeddings(embeddings, test_edges_false)
test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

# Create test-set edge labels: 1 = real edge, 0 = false edge
test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

In [16]:
# Shuffle training rows and labels with one shared random ordering so that each
# embedding row stays aligned with its label.
index = np.random.permutation(len(train_edge_labels))
train_data, train_labels = train_edge_embs[index, :], train_edge_labels[index]

In [17]:
# Shuffle test rows and labels with one shared random ordering so that each
# embedding row stays aligned with its label.
index = np.random.permutation(len(test_edge_labels))
test_data, test_labels = test_edge_embs[index, :], test_edge_labels[index]

In [18]:
# Train logistic regression classifier on train-set edge embeddings
from sklearn.linear_model import LogisticRegression

edge_classifier = LogisticRegression(random_state=0)
edge_classifier.fit(train_data, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Score the held-out pairs: take the second column of predict_proba as the
# score for each pair and summarise it with ROC-AUC and average precision.
# NOTE(review): roc_auc_score / average_precision_score are not imported in the
# visible cells (presumably re-exported by `from evaluation import *`) — confirm.
test_preds = edge_classifier.predict_proba(test_data)[:, 1]
test_roc = roc_auc_score(test_labels, test_preds)
test_ap = average_precision_score(test_labels, test_preds)

In [None]:
print('GNE Test ROC score: ', str(test_roc))
print('GNE Test AP score: ', str(test_ap))

In [None]:
# Ecoli
# GNE Test ROC score:  0.940158504921
# GNE Test AP score:  0.93897040116

In [None]:
# yeast
# GNE Test ROC score:  0.821812518988
# GNE Test AP score:  0.80728890868

In [None]:
# Ecoli
# Only using attribute embeddings with 90% training interactions
# GNE Test ROC score:  0.94749334705
# GNE Test AP score:  0.940346079826

In [None]:
# Yeast
# Only using attribute embeddings with 90% training interactions
# GNE Test ROC score:  0.841304460612
# GNE Test AP score:  0.83745133239

In [40]:
# Load an external evaluation set (space-separated, no header row).
# NOTE(review): this file is named yeast_evaluation_data.txt, while the embeddings
# read earlier in this notebook come from embeddings_ecoli.txt, and the ROC score
# recorded below for this set is ~0.51 — confirm that the embeddings and
# classifier in memory belong to the same organism as this file.
evaluation_data = pd.read_csv("/Users/kk3671/Documents/yeast_evaluation_data.txt", header=None, sep=" ")

In [41]:
test_edges = evaluation_data.iloc[:,:2].values

In [42]:
test_edges

array([[ 339, 4264],
       [ 649, 2147],
       [4216, 1904],
       ...,
       [3962, 1959],
       [2822,  435],
       [3111, 2705]])

In [22]:
test_edge_embs = get_edge_embeddings(embeddings, test_edges)

In [23]:
test_edge_labels = evaluation_data.iloc[:,2].values

In [24]:
test_edge_labels

array([1, 1, 1, ..., 0, 0, 0])

In [25]:

test_data = test_edge_embs
test_labels = test_edge_labels

In [26]:
test_preds = edge_classifier.predict_proba(test_data)[:, 1]
test_roc = roc_auc_score(test_labels, test_preds)
test_ap = average_precision_score(test_labels, test_preds)

In [27]:
print('GNE Test ROC score: ', str(test_roc))
print('GNE Test AP score: ', str(test_ap))

GNE Test ROC score:  0.5090662132803053
GNE Test AP score:  0.5051738015186562


In [48]:
test_preds

array([0.85622438, 0.10829569, 0.58840283, ..., 0.11889335, 0.10060936,
       0.07469195])

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

precision, recall, thresholds = precision_recall_curve(test_labels, test_preds)

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(test_ap))

In [None]:
plt.show()

In [None]:
thresholds

In [None]:
test_preds = edge_classifier.predict(test_data)

In [None]:
confusion_matrix(test_labels, test_preds)

In [None]:
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Tahoma']
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision/recall pairs computed from the true labels and the scores in test_preds.
precision, recall, thresholds = precision_recall_curve(test_labels, test_preds)

fig = plt.figure()
ax = fig.add_subplot(111)
# The curve is drawn as plot(recall, precision): recall runs along x and
# precision along y, so the axis labels must follow that order.
ax.set_xlabel("Recall", fontsize=16)
ax.set_ylabel("Precision", fontsize=16)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

ax.plot(recall, precision, color='b', alpha=1, label ="Precision-Recall curve: area={0:0.2f}".format(test_ap))
ax.legend(fontsize='large')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# Save before show(), as in the original cell.
plt.savefig('ecoli_pr_curve.eps', dpi =1200)
plt.show()

In [28]:
# Work on a copy: a later cell adds a 'predicted' column to `df`, and a plain
# alias would silently add that column to `evaluation_data` as well.
df = evaluation_data.copy()

In [29]:
df

Unnamed: 0,0,1,2
0,339,4264,1
1,649,2147,1
2,4216,1904,1
3,1237,3801,1
4,561,3809,1
5,226,4198,1
6,3441,3750,1
7,2958,2377,1
8,3689,1729,1
9,3720,613,1


In [30]:
df['predicted'] = test_preds

In [31]:
df = df.sort_values(['predicted'], ascending=False)

In [32]:
test_roc = roc_auc_score(df.iloc[:,2], df.iloc[:,3])
test_ap = average_precision_score(df.iloc[:,2], df.iloc[:,3])

In [33]:
print('GNE Test ROC score: ', str(test_roc))
print('GNE Test AP score: ', str(test_ap))

GNE Test ROC score:  0.5090662132803053
GNE Test AP score:  0.5051738015186562


In [34]:
df.to_csv("../Gene-Network-Embedding/data/yeast/latest_predictions.txt", index=False, header=False, sep=",")