In [1]:
import pickle 
import LoadData as data
import numpy as np
from GNE import GNE
from evaluation import *
from utils import *
import pandas as pd
import scipy.sparse as sp
import random

In [2]:
# Directory holding the inputs read by later cells
# (gene_ids.tsv, edgelist_biogrid.txt, expression_data.tsv, data_standard.txt).
path = './data/ecoli/'

In [3]:
# Gene id table; note it is parsed as space-separated despite the .tsv extension.
geneids = pd.read_csv(path + "gene_ids.tsv", sep=" ")
# The number of rows in the id table is passed to load_network below.
num_genes = geneids.shape[0]
link_file = path + "edgelist_biogrid.txt"

# load_network is a project helper (not defined in this notebook); its result is
# used below as an adjacency matrix.
adj = load_network(link_file, num_genes)

### Loading [./data/ecoli/edgelist_biogrid.txt]...


In [4]:
np.sum(adj)

148340.0

In [5]:
def create_dataset_for_generalization(adj, validation_size = None, gene_split=None):
    """Build a balanced set of positive and negative node pairs from an adjacency matrix.

    The matrix is flattened into (node, node, value) rows by
    ``convertAdjMatrixToSortedRankTSV``. Each pair is stored as (min, max) and
    de-duplicated; rows whose value is 1 become positives, and an equally sized
    random sample of rows whose value is 0 becomes the negatives.

    Sampling draws from both ``random`` and ``np.random`` without seeding, so seed
    both beforehand if a reproducible split is needed.

    Parameters
    ----------
    adj
        Adjacency matrix handed to ``convertAdjMatrixToSortedRankTSV``.
        NOTE(review): that helper is assumed to return a DataFrame whose first two
        columns are node indices and whose third column is the adjacency value -
        confirm against its definition.
    validation_size : optional
        When not None, forwarded as ``test_size`` to ``train_test_split`` to hold
        out a validation portion of the positive rows and of the negative rows.
        NOTE(review): ``train_test_split`` is not imported explicitly in this
        notebook; it is expected to already be in scope.
    gene_split : int, optional
        Offset added to both node indices of every pair, so that indices taken
        from a sub-block of a larger matrix can be shifted back to full-matrix
        numbering.

    Returns
    -------
    dict
        ``'train_pos'`` and ``'train_neg'`` arrays with one ``[i, j, value]`` row
        per pair, plus ``'val_pos'`` and ``'val_neg'`` when ``validation_size`` is
        given.

    Raises
    ------
    ValueError
        If there are fewer zero-valued pairs than positive pairs, so a balanced
        negative sample cannot be drawn.
    """
    edgelist = convertAdjMatrixToSortedRankTSV(adj)

    # Store every pair as (min, max) so that (a, b) and (b, a) collapse into a
    # single row once duplicates are dropped.
    pairs = np.array(edgelist.iloc[:, :2])
    col1 = pairs.min(axis=1).astype(int)
    col2 = pairs.max(axis=1).astype(int)
    col3 = np.array(edgelist.iloc[:, 2])
    if gene_split is not None:
        col1 = col1 + gene_split
        col2 = col2 + gene_split
    data_df = pd.DataFrame()
    data_df['i'] = col1
    data_df['j'] = col2
    data_df['k'] = col3
    data_df = data_df.drop_duplicates()

    train_edges = data_df.loc[data_df.iloc[:, 2] == 1].values
    neg_edgelist = data_df.loc[data_df.iloc[:, 2] == 0]

    # Fail with an explicit message instead of random.sample's generic
    # "Sample larger than population" when balancing is impossible.
    n_pos = train_edges.shape[0]
    n_neg = len(neg_edgelist)
    if n_pos > n_neg:
        raise ValueError(
            "Cannot draw %d negative pairs: only %d zero-valued pairs are available"
            % (n_pos, n_neg)
        )

    # Draw as many negatives as there are positives (balanced classes).
    ind = random.sample(range(n_neg), n_pos)
    neg_edges = pd.DataFrame(np.random.permutation(neg_edgelist.values))
    train_edges_false = neg_edges.iloc[ind, :].values

    dataset = {}
    dataset['train_pos'] = train_edges
    dataset['train_neg'] = train_edges_false
    if validation_size is not None:
        # Positives and negatives are split independently with the same fraction.
        train_edges, val_edges = train_test_split(train_edges, test_size=validation_size)
        train_edges_false, val_edges_false = train_test_split(train_edges_false, test_size=validation_size)
        dataset['train_pos'] = train_edges
        dataset['train_neg'] = train_edges_false
        dataset['val_pos'] = val_edges
        dataset['val_neg'] = val_edges_false
    return dataset

In [55]:
# Round-trip through an undirected NetworkX graph to get a sparse adjacency matrix.
# This cell rebinds `adj`.
# NOTE(review): `nx` is not imported in any cell shown in this notebook (presumably it
# arrives through one of the star imports), and nx.to_scipy_sparse_matrix was removed
# in NetworkX 3.0 in favour of nx.to_scipy_sparse_array - confirm the pinned version.
g = nx.Graph(adj)
adj = nx.to_scipy_sparse_matrix(g)
# Remove diagonal elements
adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj.eliminate_zeros()

# Node-level split: the first 75% of nodes (in index order) form the training block,
# and only interactions between two training nodes are kept.
adj_matrix = adj.todense()
train_genes = int(len(adj_matrix)*.75)
train_adj = adj_matrix[:train_genes,:train_genes]

In [56]:
train_adj.shape

(3383, 3383)

In [57]:
# Held-out block: interactions among the remaining nodes only. Pairs linking a
# training node to a held-out node fall in neither block and are dropped.
test_adj = adj_matrix[train_genes:,train_genes:]
test_adj.shape

(1128, 1128)

In [59]:
# Balanced positive/negative pairs from the training block; validation_size=0.1 is
# forwarded to train_test_split for the positive and the negative pairs separately.
train_dataset = create_dataset_for_generalization(train_adj, validation_size=0.1)

In [60]:
# Unpack the positive / negative pairs of the training and validation portions
train_edges, train_edges_false = train_dataset['train_pos'], train_dataset['train_neg']
val_edges, val_edges_false = train_dataset['val_pos'], train_dataset['val_neg']

# Inspect train/validation split
print("Total nodes:", train_adj.shape[0])
# np.sum adds up every entry of the symmetric matrix, so each undirected edge is
# counted twice: the figure printed as "Total edges" is 2 * num_edges.
print("Total edges:", np.sum(train_adj))
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))

Total nodes: 3383
Total edges: 158930.0
Training edges (positive): 71518
Training edges (negative): 71518
Validation edges (positive): 7947
Validation edges (negative): 7947


In [61]:
# No validation split for the held-out block. gene_split shifts the pair indices by
# the number of training nodes, so they refer to positions in the full matrix rather
# than positions inside test_adj.
test_dataset = create_dataset_for_generalization(test_adj, validation_size=None, gene_split=train_adj.shape[0])

In [62]:
# With validation_size=None all pairs are stored under the 'train_*' keys;
# here they are the held-out test pairs.
test_edges, test_edges_false = test_dataset['train_pos'], test_dataset['train_neg']

# Inspect the held-out test block
print("Total nodes:", test_adj.shape[0])
# np.sum adds up every entry of the symmetric matrix, so each undirected edge is
# counted twice: the figure printed as "Total edges" is 2 * num_edges.
print("Total edges:", np.sum(test_adj))
print("Test edges (positive):", len(test_edges))
print("Test edges (negative):", len(test_edges_false))

Total nodes: 1128
Total edges: 21046.0
Test edges (positive): 10523
Test edges (negative): 10523


In [63]:
# Positive training pairs as a set of row tuples, for the train/test overlap check below.
a1_rows = set(map(tuple, train_edges))

In [64]:
# Positive test pairs as a set of row tuples (indices already shifted by gene_split).
a2_rows = set(map(tuple, test_edges))

In [65]:
a1_rows.isdisjoint(a2_rows)

True

In [66]:
# Same overlap check for the sampled negative pairs. a1_rows / a2_rows are reused
# here, so they no longer hold the positive sets after this cell runs.
a1_rows = set(map(tuple, train_edges_false))
a2_rows = set(map(tuple, test_edges_false))
a1_rows.isdisjoint(a2_rows)

True

In [67]:
# Feature file handed to LoadData as `features_file`.
feature_file = path + 'expression_data.tsv'

In [68]:
# Only the positive training pairs are passed as links.
# Note: `Data` (the loaded dataset object) differs from `data` (the LoadData module
# alias) only by case.
Data = data.LoadData(path, train_links=train_edges, features_file=feature_file)

Constructing Nodes
attr_M: 805
id_N: 4511
Reading training links
Constructing Neighborhood maps
Constructing train data


In [69]:
len(train_edges)

71518

In [70]:
len(test_edges)

10523

In [71]:
# Validation pairs stacked positives-first, with matching labels (1 = real edge, 0 = false edge)
validation_edges = np.concatenate((val_edges, val_edges_false), axis=0)
val_edge_labels = np.r_[np.ones(len(val_edges)), np.zeros(len(val_edges_false))]

In [72]:
# Settings passed to the GNE constructor below. Key order matches the original
# assignments so the printed dict is unchanged.
parameters = {
    'id_embedding_size': 128,
    'attr_embedding_size': 128,
    'batch_size': 128,
    'alpha': 1,
    'n_neg_samples': 10,
    'epoch': 20,
    'representation_size': 128,
    'learning_rate': 0.002,
}
parameters

{'alpha': 1,
 'attr_embedding_size': 128,
 'batch_size': 128,
 'epoch': 20,
 'id_embedding_size': 128,
 'learning_rate': 0.002,
 'n_neg_samples': 10,
 'representation_size': 128}

In [73]:
# NOTE(review): 2018 is an unexplained positional argument - presumably a random
# seed; confirm against GNE's constructor.
model = GNE(path, Data, 2018, parameters)

{'id_embedding_size': 128, 'attr_embedding_size': 128, 'batch_size': 128, 'alpha': 1, 'n_neg_samples': 10, 'epoch': 20, 'representation_size': 128, 'learning_rate': 0.002}


In [74]:
# Train with the validation pairs and labels; the log reports a validation AUC per epoch.
# NOTE(review): the saved output shows this run was interrupted (KeyboardInterrupt), so
# `embeddings` and `attr_weights` are only bound when training runs to completion.
embeddings, attr_weights = model.train(validation_edges, val_edge_labels)

Using structure and attribute embedding
Epoch:      1, Train-Batch Loss: 7.890777881, Validation AUC: 0.586608005 *
Epoch:      2, Train-Batch Loss: 4.407956974, Validation AUC: 0.622778734 *


KeyboardInterrupt: 

In [None]:
attr_weights.shape

In [None]:
# pandas is already imported as `pd` in the first cell; this re-import is redundant but harmless.
import pandas as pd
# Persist the attribute weights as comma-separated text without header or index.
pd.DataFrame(attr_weights).to_csv("attribute_weights_ecoli.txt", header=False, index=False, sep=",")

In [None]:
# Attribute table that is read into `attributes`.
datafile = path + 'data_standard.txt'

In [None]:
# Space-separated file without a header row; the first column becomes the row index.
# NOTE(review): later cells address rows by integer position using the edge-list
# node indices - confirm the row order of this file matches that numbering.
attributes = pd.read_csv(datafile, index_col=0, header=None, sep=" ")

In [None]:
attributes.shape

In [None]:
# Multiply the attribute table by the attribute weights returned from training.
# This overwrites the `embeddings` returned by model.train.
embeddings =  np.dot(attributes, attr_weights)

In [None]:
# embeddings = attributes.values

In [None]:
embeddings.shape

In [None]:
# Train-set edge embeddings
pos_train_edge_embs = get_edge_embeddings(embeddings, train_edges)
neg_train_edge_embs = get_edge_embeddings(embeddings, train_edges_false)
train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])
# Create train-set edge labels: 1 = real edge, 0 = false edge
train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

In [None]:
# Test-set edge embeddings, labels
pos_test_edge_embs = get_edge_embeddings(embeddings, test_edges)
neg_test_edge_embs = get_edge_embeddings(embeddings, test_edges_false)
test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

# Create val-set edge labels: 1 = real edge, 0 = false edge
test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

In [None]:
# Shuffle the training pairs and their labels with one shared permutation.
# Passing the length directly returns an integer index array even when it is empty;
# permuting an empty Python list yields a float array that cannot be used to index.
index = np.random.permutation(len(train_edge_labels))
train_data = train_edge_embs[index,:]
train_labels = train_edge_labels[index]

In [None]:
# Shuffle the test pairs and their labels with one shared permutation.
# Passing the length directly returns an integer index array even when it is empty;
# permuting an empty Python list yields a float array that cannot be used to index.
index = np.random.permutation(len(test_edge_labels))
test_data = test_edge_embs[index,:]
test_labels = test_edge_labels[index]

In [None]:
# Train logistic regression classifier on train-set edge embeddings
from sklearn.linear_model import LogisticRegression

edge_classifier = LogisticRegression(random_state=0)
edge_classifier.fit(train_data, train_labels)

In [None]:
# Column 1 of predict_proba (the class labelled 1, i.e. a real edge) is used as the score.
# NOTE(review): roc_auc_score and average_precision_score are not imported in any
# visible cell - presumably they come from one of the star imports.
test_preds = edge_classifier.predict_proba(test_data)[:, 1]
test_roc = roc_auc_score(test_labels, test_preds)
test_ap = average_precision_score(test_labels, test_preds)

In [None]:
# Report the classifier's scores on the held-out pairs (print applies str() itself)
print('Test ROC score: ', test_roc)
print('Test AP score: ', test_ap)

In [None]:
# Yeast
# Only using attribute embeddings with 90% nodes as training to predict interactions of remaining 10% nodes
# Test ROC score:  0.636207024495
# Test AP score:  0.619662145155

# logistic regression of expression data 
# Test ROC score:  0.618279048852
# Test AP score:  0.597478518784

# correlation
# Test ROC score:  0.568089255807
# Test AP score:  0.568394943682

In [None]:
# Yeast
# Only using attribute embeddings with 75% nodes as training to predict interactions of remaining 25% nodes
# Test ROC score:  0.64579528764
# Test AP score:  0.629618217338
# logistic regression of expression data 
# Test ROC score:  0.639091808068
# Test AP score:  0.615704858074
# correlation
# Test ROC score:  0.579982316533
# Test AP score:  0.57969109199

In [None]:
# Yeast
# Only using attribute embeddings with 50% nodes as training to predict interactions of remaining 50% nodes
# Test ROC score:  0.632323760642
# Test AP score:  0.618320422841
# logistic regression of expression data 
# Test ROC score:  0.637106649428
# Test AP score:  0.617446089276
# correlation
# Test ROC score:  0.577860323413
# Test AP score:  0.579623105575

In [None]:
# Ecoli
# Only using attribute embeddings with 90% nodes as training to predict interactions of remaining 10% nodes
# Test ROC score:  0.637298544227
# Test AP score:  0.624893907808
# logistic regression of expression data 
# Test ROC score:  0.630899399184
# Test AP score:  0.613969657994
# correlation
# Test ROC score:  0.536818673513
# Test AP score:  0.557680007943

In [None]:
# Ecoli
# Only using attribute embeddings with 75% nodes as training to predict interactions of remaining 25% nodes

# logistic regression of expression data 


# correlation



In [None]:
# Ecoli
# Only using attribute embeddings with 50% nodes as training to predict interactions of remaining 50% nodes

# logistic regression of expression data 


# correlation



In [None]:
# np.corrcoef treats each row as one variable, so corr[a, b] is the correlation
# between row a and row b of the attribute table.
corr = np.corrcoef(attributes)

In [None]:
# Test pairs stacked positives-first and cast to int so they can index `corr`;
# labels follow the same order (1 = real edge, 0 = false edge)
test_edges_cor = np.concatenate((test_edges, test_edges_false), axis=0).astype(int)
test_edge_labels_cor = np.r_[np.ones(len(test_edges)), np.zeros(len(test_edges_false))]

In [None]:
# Score each test pair by the correlation between its two nodes' attribute rows.
# Integer-array indexing pairs column 0 with column 1 row by row, replacing the
# per-row Python loop; the result is kept as a list of NumPy floats, as before.
y_predict = list(corr[test_edges_cor[:, 0], test_edges_cor[:, 1]])

In [None]:
# Correlation baseline: the raw correlation is used directly as the ranking score.
# This overwrites test_roc / test_ap from the logistic-regression evaluation above.
test_roc = roc_auc_score(test_edge_labels_cor, y_predict)
test_ap = average_precision_score(test_edge_labels_cor, y_predict)

In [None]:
# Report the correlation baseline's scores on the held-out pairs (print applies str() itself)
print('Test ROC score: ', test_roc)
print('Test AP score: ', test_ap)