In [1]:
import time
import os
import sys
# Put the local checkout first on the import path; presumably this is what
# makes the `gae.*` imports further down resolve.
# NOTE(review): hard-coded, machine-specific absolute path — adjust it (or make
# it configurable) before running this notebook on another machine.
sys.path.insert(0, "/home/dada/jupyter/sdml/hw1/gae")

In [2]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp

In [3]:
import tensorflow as tf
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

from gae.optimizer import OptimizerAE, OptimizerVAE
from gae.input_data import load_data
from gae.model import GCNModelAE, GCNModelVAE
from gae.preprocessing import preprocess_graph, construct_feed_dict, sparse_to_tuple, mask_test_edges

In [4]:
# Display the installed TensorFlow version (the recorded output is '1.8.0').
tf.__version__

'1.8.0'

In [5]:
# setting
epochs = 200
dropout = 0.

In [6]:
# Read the edge lists. The train and test-seen files become directed graphs
# with integer node ids; the test file is loaded as a plain numeric array.
G_train = nx.read_edgelist(
    '../../t1-train.txt',
    create_using=nx.DiGraph(),
    nodetype=int,
    edgetype=int,
)
G_test_seen = nx.read_edgelist(
    '../../t1-test-seen.txt',
    create_using=nx.DiGraph(),
    nodetype=int,
    edgetype=int,
)
G_test = np.loadtxt('../../t1-test.txt')

# Working graph: only the test-seen edges are added.
# NOTE(review): adding the training edges (G.add_edges_from(G_train.edges))
# was left disabled in this cell — confirm that is intentional.
G = nx.DiGraph()
G.add_edges_from(G_test_seen.edges)
nodes = list(G.nodes)

In [7]:
adj = nx.adjacency_matrix(G)

# Keep the full adjacency matrix, with its diagonal entries removed, for later.
diagonal_part = sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
adj_orig = adj - diagonal_part
adj_orig.eliminate_zeros()

# Split the edges with gae's mask_test_edges (judging by the names: a training
# adjacency plus validation/test edges and their `_false` counterparts), then
# continue with the training adjacency only.
(adj_train, train_edges,
 val_edges, val_edges_false,
 test_edges, test_edges_false) = mask_test_edges(adj)
adj = adj_train

In [8]:
# Featureless setting: the feature matrix is a sparse identity with one
# row/column per row of `adj`.
features = sp.identity(adj.shape[0])  # featureless

In [9]:
# Some preprocessing
adj_norm = preprocess_graph(adj)

In [10]:
# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}

In [11]:
num_nodes = adj.shape[0]

# `features` is rebound from a sparse matrix to its tuple form. Guard the
# conversion so that re-running this cell does not fail on a tuple that has no
# `.tocoo()`.
if sp.issparse(features):
    features = sparse_to_tuple(features.tocoo())

# Assumes sparse_to_tuple returns (coords, values, shape) — TODO confirm
# against gae.preprocessing.
num_features = features[2][1]
features_nonzero = features[1].shape[0]

In [12]:
from gae.layers import GraphConvolution, GraphConvolutionSparse, InnerProductDecoder

class Model(object):
    """Minimal base class for the TensorFlow models defined in this notebook.

    Subclasses implement ``_build()``; ``build()`` runs it inside a variable
    scope named after the model and records the variables created there.

    Keyword arguments (any other keyword fails an assertion):
        name: variable-scope name; defaults to the lower-cased class name.
        logging: flag stored on the instance; defaults to False.
    """

    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging'}
        # Validate once (the original ran this identical check twice).
        for kwarg in kwargs:
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg

        name = kwargs.get('name')
        if not name:
            name = self.__class__.__name__.lower()
        self.name = name

        self.logging = kwargs.get('logging', False)

        # Filled in by build(): variable name -> variable.
        self.vars = {}

    def _build(self):
        """Create the model's ops; must be provided by subclasses."""
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name):
            self._build()
        # Collect the global variables that live under this model's scope.
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {var.name: var for var in variables}

    def fit(self):
        """Placeholder; does nothing."""
        pass

    def predict(self):
        """Placeholder; does nothing."""
        pass

    
    
class GCNModelVAE(Model):
    """Variational graph auto-encoder assembled from gae layers.

    Note: this definition replaces the ``GCNModelVAE`` imported from
    ``gae.model`` at the top of the notebook.

    Args:
        placeholders: dict providing the 'features', 'adj' and 'dropout' inputs.
        num_features: input dimension of the first layer.
        num_nodes: number of rows of the sampled noise (stored as ``n_samples``).
        features_nonzero: forwarded to the sparse input layer.
        **kwargs: passed to ``Model`` ('name' and 'logging' only).

    The graph is built in the constructor and exposes ``hidden1``, ``z_mean``,
    ``z_log_std``, ``z`` and ``reconstructions``.
    """

    def __init__(self, placeholders, num_features, num_nodes, features_nonzero, **kwargs):
        super(GCNModelVAE, self).__init__(**kwargs)

        # Inputs taken from the placeholder dictionary.
        self.inputs = placeholders['features']
        self.adj = placeholders['adj']
        self.dropout = placeholders['dropout']

        # Static sizes.
        self.input_dim = num_features
        self.features_nonzero = features_nonzero
        self.n_samples = num_nodes

        self.build()

    def _build(self):
        def identity(x):
            # Linear activation: pass the layer output through unchanged.
            return x

        # Sparse-input graph convolution: input_dim -> 32, ReLU.
        input_layer = GraphConvolutionSparse(
            input_dim=self.input_dim,
            output_dim=32,
            adj=self.adj,
            features_nonzero=self.features_nonzero,
            act=tf.nn.relu,
            dropout=self.dropout,
            logging=self.logging,
        )
        self.hidden1 = input_layer(self.inputs)

        # Two parallel 32 -> 16 heads on top of the shared hidden layer.
        mean_layer = GraphConvolution(
            input_dim=32,
            output_dim=16,
            adj=self.adj,
            act=identity,
            dropout=self.dropout,
            logging=self.logging,
        )
        self.z_mean = mean_layer(self.hidden1)

        log_std_layer = GraphConvolution(
            input_dim=32,
            output_dim=16,
            adj=self.adj,
            act=identity,
            dropout=self.dropout,
            logging=self.logging,
        )
        self.z_log_std = log_std_layer(self.hidden1)

        # Sample z = mean + noise * exp(log_std).
        noise = tf.random_normal([self.n_samples, 16])
        self.z = self.z_mean + noise * tf.exp(self.z_log_std)

        decoder = InnerProductDecoder(
            input_dim=16,
            act=identity,
            logging=self.logging,
        )
        self.reconstructions = decoder(self.z)

In [13]:
# Instantiate the model; its constructor calls build(), so the graph ops are
# created here.
model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

In [14]:
# Weighting terms for the reconstruction loss, derived from the training
# adjacency matrix.
_total_entries = adj.shape[0] * adj.shape[0]
_adj_sum = adj.sum()
_remaining = _total_entries - _adj_sum

# (entries not accounted for by adj.sum()) / adj.sum()
pos_weight = float(_remaining) / _adj_sum
# total entries / (2 * entries not accounted for by adj.sum())
norm = _total_entries / float(_remaining * 2)

In [15]:
class OptimizerVAE(object):
    """Loss, Adam update op and accuracy for the variational auto-encoder.

    Note: this definition replaces the ``OptimizerVAE`` imported from
    ``gae.optimizer`` at the top of the notebook.

    Args:
        preds: logits produced by the model.
        labels: targets matching ``preds``.
        model: object exposing ``z_mean`` and ``z_log_std``.
        num_nodes: scales the latent term by ``0.5 / num_nodes``.
        pos_weight: positive-class weight of the cross-entropy.
        norm: multiplier applied to the mean cross-entropy.

    Attributes:
        log_lik: ``norm`` times the mean weighted cross-entropy.
        kl: the scaled latent term that is subtracted from the cost.
        cost: ``log_lik - kl``, the quantity minimised by ``opt_op``.
        grads_vars: result of ``compute_gradients`` on ``cost``.
        correct_prediction / accuracy: agreement of ``sigmoid(preds) >= 0.5``
            with the integer-cast labels, and its mean.
    """

    def __init__(self, preds, labels, model, num_nodes, pos_weight, norm):
        logits = preds
        targets = labels

        # Reconstruction term.
        weighted_xent = tf.nn.weighted_cross_entropy_with_logits(
            logits=logits, targets=targets, pos_weight=pos_weight)
        self.cost = norm * tf.reduce_mean(weighted_xent)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.01)  # Adam Optimizer

        # Latent term.
        self.log_lik = self.cost
        latent_row_sums = tf.reduce_sum(
            1 + 2 * model.z_log_std - tf.square(model.z_mean) -
            tf.square(tf.exp(model.z_log_std)), 1)
        self.kl = (0.5 / num_nodes) * tf.reduce_mean(latent_row_sums)
        self.cost = self.log_lik - self.kl

        self.opt_op = self.optimizer.minimize(self.cost)
        self.grads_vars = self.optimizer.compute_gradients(self.cost)

        # Accuracy of the thresholded predictions.
        predicted = tf.cast(tf.greater_equal(tf.sigmoid(logits), 0.5), tf.int32)
        expected = tf.cast(targets, tf.int32)
        self.correct_prediction = tf.equal(predicted, expected)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))

In [16]:
# Targets: the 'adj_orig' placeholder densified and flattened to one dimension.
_labels_flat = tf.reshape(
    tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
    [-1],
)

opt = OptimizerVAE(
    preds=model.reconstructions,
    labels=_labels_flat,
    model=model,
    num_nodes=num_nodes,
    pos_weight=pos_weight,
    norm=norm,
)

In [17]:
def get_roc_score(edges_pos, edges_neg, emb=None):
    """Score positive edges against negative edges using node embeddings.

    Each candidate pair (i, j) is scored with sigmoid(emb[i] . emb[j]).

    Args:
        edges_pos: iterable of (i, j) index pairs labelled 1.
        edges_neg: iterable of (i, j) index pairs labelled 0.
        emb: optional 2-D array of node embeddings. When None, it is obtained
            by running ``model.z_mean`` in the notebook-level ``sess`` with the
            notebook-level ``feed_dict`` and dropout set to 0.

    Returns:
        (roc_score, ap_score, acc_score): ROC AUC, average precision, and
        accuracy of the predictions thresholded at 0.5.
    """
    if emb is None:
        # Work on a copy so the training feed_dict keeps its dropout value.
        eval_feed = dict(feed_dict)
        eval_feed[placeholders['dropout']] = 0
        emb = sess.run(model.z_mean, feed_dict=eval_feed)
    emb = np.asarray(emb)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def edge_probabilities(edges):
        # Score only the requested pairs rather than building the dense
        # (nodes x nodes) reconstruction matrix.
        pairs = np.asarray(list(edges))
        if pairs.size == 0:
            return np.zeros(0)
        src, dst = pairs[:, 0], pairs[:, 1]
        return sigmoid(np.einsum('ij,ij->i', emb[src], emb[dst]))

    preds = edge_probabilities(edges_pos)
    preds_neg = edge_probabilities(edges_neg)

    preds_all = np.hstack([preds, preds_neg])
    # One label per prediction: the zeros must match the number of negatives
    # (the original sized them with the number of positives).
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])

    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    # accuracy_score needs hard class labels, not probabilities; 0.5 is the
    # same threshold OptimizerVAE uses for its accuracy.
    acc_score = accuracy_score(labels_all, preds_all >= 0.5)

    return roc_score, ap_score, acc_score

In [18]:
# Initialize session
os.environ['CUDA_VISIBLE_DEVICES'] = ""
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
sess = tf.Session()#config=tf.ConfigProto(gpu_options=gpu_options)
sess.run(tf.global_variables_initializer())

In [None]:
cost_val = []
acc_val = []
val_roc_score = []

# Training targets: the training adjacency with the diagonal added, converted
# to tuple form by sparse_to_tuple.
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)

# Train model
for epoch in range(epochs):

    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
    feed_dict.update({placeholders['dropout']: dropout})
    # Run single weight update
    outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

    # outs[0] is the result of the update op; keep the loss and accuracy.
    avg_cost = outs[1]
    avg_accuracy = outs[2]

    # get_roc_score returns (roc, ap, acc); unpacking only two values raised
    # "too many values to unpack".
    roc_curr, ap_curr, acc_curr = get_roc_score(val_edges, val_edges_false)
    val_roc_score.append(roc_curr)

    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
          "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
          "val_ap=", "{:.5f}".format(ap_curr),
          "time=", "{:.5f}".format(time.time() - t))

print("Optimization Finished!")

roc_score, ap_score, acc_score = get_roc_score(test_edges, test_edges_false)
print('Test ROC score: ' + str(roc_score))
print('Test AP score: ' + str(ap_score))
print('Test accuracy: ' + str(acc_score))

In [1]:
# Restore variables persisted with %store in another kernel session: the graph
# GG and the per-node lookup tables that extract_feat reads.
%store -r GG
%store -r all_deg_cent
%store -r all_in_cent
%store -r all_out_cent
%store -r all_b_cent
%store -r all_l_cent
%store -r all_tri
%store -r all_kc
%store -r all_page

In [5]:
def extract_feat(node):
    """Return the feature list for one node of the notebook-level graph ``GG``.

    The list holds, in order: degree, in-degree and out-degree of ``node`` in
    ``GG``, followed by the node's entry in each of ``all_deg_cent``,
    ``all_in_cent``, ``all_out_cent``, ``all_b_cent``, ``all_l_cent``,
    ``all_tri``, ``all_kc`` and ``all_page`` (0 where the node has no entry).
    """
    feat = [GG.degree(node), GG.in_degree(node), GG.out_degree(node)]

    # Lookup order fixes the column order of the remaining features.
    lookup_tables = (
        all_deg_cent,
        all_in_cent,
        all_out_cent,
        all_b_cent,
        all_l_cent,
        all_tri,
        all_kc,
        all_page,
    )
    feat.extend(table[node] if node in table else 0 for table in lookup_tables)
    return feat

In [7]:
# Feature list for every node of GG, keyed by node.
node_feat = {node: extract_feat(node) for node in GG.nodes}

In [13]:
# Number of nodes that received a feature list (recorded output: 29402).
len(node_feat)

29402

In [12]:
import pickle

# Write the node -> feature-list mapping to disk.
_feat_path = 'node_feat_dict.pkl'
with open(_feat_path, 'wb') as out_file:
    pickle.dump(node_feat, out_file)