In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [6]:
from __future__ import print_function
import numpy as np
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from sklearn.linear_model import LogisticRegression
import time
import ast


import networkx as nx
import pickle as pkl
import numpy as np
import scipy.sparse as sp
import tensorflow as tf

from collections import namedtuple
import math

In [7]:
class Graph(object):
    def __init__(self):
        self.G = None
        self.look_up_dict = {}
        self.look_back_list = []
        self.node_size = 0

    def encode_node(self):
        look_up = self.look_up_dict
        look_back = self.look_back_list
        for node in self.G.nodes():
            look_up[node] = self.node_size
            look_back.append(node)
            self.node_size += 1
            self.G.nodes[node]['status'] = ''

    def read_g(self, g):
        self.G = g
        self.encode_node()

    def read_adjlist(self, filename):
        """ Read graph from adjacency file in which the edge must be unweighted
            the format of each line: v1 n1 n2 n3 ... nk
            :param filename: the filename of input file
        """
        self.G = nx.read_adjlist(filename, create_using=nx.DiGraph())
        for i, j in self.G.edges():
            self.G[i][j]['weight'] = 1.0
        self.encode_node()

    def read_edgelist(self, filename, weighted=False, directed=False):
        self.G = nx.DiGraph()

        if directed:
            def read_unweighted(l):
                src, dst = l.split()
                self.G.add_edge(src, dst)
                self.G[src][dst]['weight'] = 1.0

            def read_weighted(l):
                src, dst, w = l.split()
                self.G.add_edge(src, dst)
                self.G[src][dst]['weight'] = float(w)
        else:
            def read_unweighted(l):
                src, dst = l.split()
                self.G.add_edge(src, dst)
                self.G.add_edge(dst, src)
                self.G[src][dst]['weight'] = 1.0
                self.G[dst][src]['weight'] = 1.0

            def read_weighted(l):
                src, dst, w = l.split()
                self.G.add_edge(src, dst)
                self.G.add_edge(dst, src)
                self.G[src][dst]['weight'] = float(w)
                self.G[dst][src]['weight'] = float(w)
        fin = open(filename, 'r')
        func = read_unweighted
        if weighted:
            func = read_weighted
        while 1:
            l = fin.readline()
            if l == '':
                break
            func(l)
        fin.close()
        self.encode_node()

    def read_node_label(self, filename):
        fin = open(filename, 'r')
        while 1:
            l = fin.readline()
            if l == '':
                break
            vec = l.split()
            self.G.nodes[vec[0]]['label'] = vec[1:]
        fin.close()

    def read_node_features(self, filename):
        fin = open(filename, 'r')
        for l in fin.readlines():
            vec = l.split()
            self.G.nodes[vec[0]]['feature'] = np.array(
                [float(x) for x in vec[1:]])
        fin.close()

    def read_node_status(self, filename):
        fin = open(filename, 'r')
        while 1:
            l = fin.readline()
            if l == '':
                break
            vec = l.split()
            self.G.nodes[vec[0]]['status'] = vec[1]  # train test valid
        fin.close()

    def read_edge_label(self, filename):
        fin = open(filename, 'r')
        while 1:
            l = fin.readline()
            if l == '':
                break
            vec = l.split()
            self.G[vec[0]][vec[1]]['label'] = vec[2:]
        fin.close()

    
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np
import numpy 

class Classifier(object):

    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[int(x)] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        results['accuracy'] = accuracy_score(Y, Y_)
        
        # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]]))
        # print('-------------------')
        #print(results)
        return results
        # print('-------------------')

    def predict(self, X, top_k_list):
        X_ = numpy.asarray([self.embeddings[int(x)] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        state = numpy.random.get_state()

        training_size = int(train_precent * len(X))
        numpy.random.seed(seed)
        shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

        self.train(X_train, Y_train, Y)
        numpy.random.set_state(state)
        return self.evaluate(X_test, Y_test)
    

label_file = './../../../data/cora/cora_labels.txt'


#label_file = 'group_clean.txt'
def read_node_label(filename):
    fin = open(filename, 'r')
    X = []
    Y = []
    while 1:
        l = fin.readline()
        if l == '':
            break
        vec = l.strip().split(' ')
        X.append(vec[0])
        Y.append(vec[1:])
    fin.close()
    return X, Y
#vectors = model.vectors
#vectors = embedding
#vectors = w2v.vectors
#vectors = embeddings_concat_norm
#X, Y = read_node_label(label_file)

from sklearn.multiclass import OneVsRestClassifier
class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            probs_[:] = 0
            probs_[labels] = 1
            all_labels.append(probs_)
        return numpy.asarray(all_labels)


import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
    
def glorot(shape, name=None):
    """Glorot & Bengio (AISTATS 2010) init."""
    init_range = np.sqrt(6.0/(shape[0]+shape[1]))
    initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32)
    return tf.Variable(initial, name=name)

class Model(object):
    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging', 'model_size'}
        for kwarg in kwargs.keys():
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
        name = kwargs.get('name')
        if not name:
            name = self.__class__.__name__.lower()
        self.name = name

        logging = kwargs.get('logging', False)
        self.logging = logging

        self.vars = {}
        self.placeholders = {}

        self.layers = []
        self.activations = []

        self.inputs = None
        self.outputs = None

        self.loss = 0
        self.accuracy = 0
        self.optimizer = None
        self.opt_op = None

    def _build(self):
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name):
            self._build()

        # Build sequential layer model
        self.activations.append(self.inputs)
        for layer in self.layers:
            hidden = layer(self.activations[-1])
            self.activations.append(hidden)
        self.outputs = self.activations[-1]

        # Store model variables for easy access
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {var.name: var for var in variables}

        # Build metrics
        self._loss()
        self._accuracy()

        self.opt_op = self.optimizer.minimize(self.loss)

    def predict(self):
        pass

    def _loss(self):
        raise NotImplementedError

    def _accuracy(self):
        raise NotImplementedError

    def save(self, sess=None):
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = saver.save(sess, "tmp/%s.ckpt" % self.name)
        print("Model saved in file: %s" % save_path)

    def load(self, sess=None):
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = "tmp/%s.ckpt" % self.name
        saver.restore(sess, save_path)
        print("Model restored from file: %s" % save_path)


class MLP(Model):
    """ A standard multi-layer perceptron """
    def __init__(self, placeholders, dims, categorical=True, **kwargs):
        super(MLP, self).__init__(**kwargs)

        self.dims = dims
        self.input_dim = dims[0]
        self.output_dim = dims[-1]
        self.placeholders = placeholders
        self.categorical = categorical

        self.inputs = placeholders['features']
        self.labels = placeholders['labels']

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss
        for var in self.layers[0].vars.values():
            self.loss += weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        if self.categorical:
            self.loss += metrics.masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                    self.placeholders['labels_mask'])
        # L2
        else:
            diff = self.labels - self.outputs
            self.loss += tf.reduce_sum(tf.sqrt(tf.reduce_sum(diff * diff, axis=1)))

    def _accuracy(self):
        if self.categorical:
            self.accuracy = metrics.masked_accuracy(self.outputs, self.placeholders['labels'],
                    self.placeholders['labels_mask'])

    def _build(self):
        self.layers.append(layers.Dense(input_dim=self.input_dim,
                                 output_dim=self.dims[1],
                                 act=tf.nn.relu,
                                 dropout=self.placeholders['dropout'],
                                 sparse_inputs=False,
                                 logging=self.logging))

        self.layers.append(layers.Dense(input_dim=self.dims[1],
                                 output_dim=self.output_dim,
                                 act=lambda x: x,
                                 dropout=self.placeholders['dropout'],
                                 logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)

class GeneralizedModel(Model):
    """
    Base class for models that aren't constructed from traditional, sequential layers.
    Subclasses must set self.outputs in _build method
    (Removes the layers idiom from build method of the Model class)
    """

    def __init__(self, **kwargs):
        super(GeneralizedModel, self).__init__(**kwargs)
        

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name, reuse=True):
            self._build()

        # Store model variables for easy access
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {var.name: var for var in variables}

        # Build metrics
        self._loss()
        self._accuracy()

        self.opt_op = self.optimizer.minimize(self.loss)

# SAGEInfo is a namedtuple that specifies the parameters 
# of the recursive GraphSAGE layers
SAGEInfo = namedtuple("SAGEInfo",
    ['layer_name', # name of the layer (to get feature embedding etc.)
     'neigh_sampler', # callable neigh_sampler constructor
     'num_samples',
     'output_dim' # the output (i.e., hidden) dimension
    ])



class UnigramTable:
    """
    Using weight list to initialize the drawing 
    """
    def __init__(self, vocab, power=0.75):
        vocab_size = len(vocab)
        norm = sum([math.pow(t, power) for t in vocab]) # Normalizing constant

        table_size = int(1e8) # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        print('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        for t in range(vocab_size):
            p += float(math.pow(vocab[t], power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = t
                i += 1
        self.table = table
        print('Finish filling unigram table')

    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]
  

        
_LAYER_UIDS = {}

def get_layer_uid(layer_name=''):
    """Helper function, assigns unique layer IDs."""
    if layer_name not in _LAYER_UIDS:
        _LAYER_UIDS[layer_name] = 1
        return 1
    else:
        _LAYER_UIDS[layer_name] += 1
        return _LAYER_UIDS[layer_name]

class Layer(object):
    """Base layer class. Defines basic API for all layer objects.
    Implementation inspired by keras (http://keras.io).
    # Properties
        name: String, defines the variable scope of the layer.
        logging: Boolean, switches Tensorflow histogram logging on/off
    # Methods
        _call(inputs): Defines computation graph of layer
            (i.e. takes input, returns output)
        __call__(inputs): Wrapper for _call()
        _log_vars(): Log all variables
    """

    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging', 'model_size'}
        for kwarg in kwargs.keys():
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
        name = kwargs.get('name')
        if not name:
            layer = self.__class__.__name__.lower()
            name = layer + '_' + str(get_layer_uid(layer))
        self.name = name
        self.vars = {}
        logging = kwargs.get('logging', False)
        self.logging = logging
        self.sparse_inputs = False

    def _call(self, inputs):
        return inputs

    def __call__(self, inputs):
        with tf.name_scope(self.name):
            if self.logging and not self.sparse_inputs:
                tf.summary.histogram(self.name + '/inputs', inputs)
            outputs = self._call(inputs)
            if self.logging:
                tf.summary.histogram(self.name + '/outputs', outputs)
            return outputs

    def _log_vars(self):
        for var in self.vars:
            tf.summary.histogram(self.name + '/vars/' + var, self.vars[var])
            
def zeros(shape, name=None):
    """All zeros."""
    initial = tf.zeros(shape, dtype=tf.float32)
    return tf.Variable(initial, name=name)

class Dense(Layer):
    """Dense layer."""
    def __init__(self, input_dim, output_dim, dropout=0., 
                 act=tf.nn.relu, placeholders=None, bias=True, featureless=False, 
                 sparse_inputs=False, **kwargs):
        super(Dense, self).__init__(**kwargs)

        self.dropout = dropout

        self.act = act
        self.featureless = featureless
        self.bias = bias
        self.input_dim = input_dim
        self.output_dim = output_dim

        # helper variable for sparse dropout
        self.sparse_inputs = sparse_inputs
        if sparse_inputs:
            self.num_features_nonzero = placeholders['num_features_nonzero']

        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = tf.get_variable('weights', shape=(input_dim, output_dim),
                                         dtype=tf.float32, 
                                         initializer=tf.contrib.layers.xavier_initializer(),
                                         regularizer=tf.contrib.layers.l2_regularizer(weight_decay))
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        x = inputs

        x = tf.nn.dropout(x, 1-self.dropout)

        # transform
        output = tf.matmul(x, self.vars['weights'])

        # bias
        if self.bias:
            output += self.vars['bias']

        return self.act(output)
    
class BipartiteEdgePredLayer(Layer):
    def __init__(self, input_dim1, input_dim2, placeholders, dropout=False, act=tf.nn.sigmoid,
            loss_fn='xent', neg_sample_weights=1.0,
            bias=False, bilinear_weights=False, **kwargs):
        """
        Basic class that applies skip-gram-like loss
        (i.e., dot product of node+target and node and negative samples)
        Args:
            bilinear_weights: use a bilinear weight for affinity calculation: u^T A v. If set to
                false, it is assumed that input dimensions are the same and the affinity will be 
                based on dot product.
        """
        super(BipartiteEdgePredLayer, self).__init__(**kwargs)
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.act = act
        self.bias = bias
        self.eps = 1e-7

        # Margin for hinge loss
        self.margin = 0.1
        self.neg_sample_weights = neg_sample_weights

        self.bilinear_weights = bilinear_weights

        if dropout:
            self.dropout = placeholders['dropout']
        else:
            self.dropout = 0.

        self.neg_sample_size = placeholders["neg_sample_size"]
        self.batch_size = placeholders["batch_size"]
        
        print("self.neg_sample_size ",self.neg_sample_size)
        print("self.batch_size ",self.batch_size)
        
        # output a likelihood term
        self.output_dim = 1
        with tf.variable_scope(self.name + '_vars'):
            # bilinear form
            if bilinear_weights:
                #self.vars['weights'] = glorot([input_dim1, input_dim2],
                #                              name='pred_weights')
                self.vars['weights'] = tf.get_variable(
                        'pred_weights', 
                        shape=(input_dim1, input_dim2),
                        dtype=tf.float32, 
                        initializer=tf.contrib.layers.xavier_initializer())

            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if loss_fn == 'xent':
            self.loss_fn = self._xent_loss
        elif loss_fn == 'skipgram':
            self.loss_fn = self._skipgram_loss
        elif loss_fn == 'hinge':
            self.loss_fn = self._hinge_loss

        print("loss_fn ",loss_fn)
        
        if self.logging:
            self._log_vars()

    def affinity(self, inputs1, inputs2):
        """ Affinity score between batch of inputs1 and inputs2.
        Args:
            inputs1: tensor of shape [batch_size x feature_size].
        """
        # shape: [batch_size, input_dim1]
        if self.bilinear_weights:
            prod = tf.matmul(inputs2, tf.transpose(self.vars['weights']))
            self.prod = prod
            result = tf.reduce_sum(inputs1 * prod, axis=1)
        else:
            result = tf.reduce_sum(inputs1 * inputs2, axis=1)
        return result

    def neg_cost(self, inputs1, neg_samples, hard_neg_samples=None):
        """ For each input in batch, compute the sum of its affinity to negative samples.
        Returns:
            Tensor of shape [batch_size x num_neg_samples]. For each node, a list of affinities to
                negative samples is computed.
        """
        if self.bilinear_weights:
            inputs1 = tf.matmul(inputs1, self.vars['weights'])
        #neg_aff = tf.matmul(inputs1, tf.transpose(neg_samples))
        #neg_aff =  tf.reduce_sum(inputs1 * neg_samples, axis=1)
        
        
        shape = inputs1.shape
        dim = shape[1]
        
        print("orginal neg_samples shape ",neg_samples.shape)
        
        neg_samples = tf.reshape(neg_samples, [self.batch_size, self.neg_sample_size, dim])
        
        print("reshape1 neg_samples shape ",neg_samples.shape)
        
        neg_samples = tf.transpose(neg_samples, perm=[0, 2, 1])
        
        print("reshape2 neg_samples shape ",neg_samples.shape)
        
        print("orginal inputs1 shape ",inputs1.shape)
        
        inputs1 = tf.reshape(inputs1, [self.batch_size, 1, dim])
        
        temp_neg_aff = tf.matmul(inputs1, neg_samples)
        print("temp_neg_aff shape ",temp_neg_aff.shape)
        
        neg_aff = tf.squeeze(tf.matmul(inputs1, neg_samples),[1])
        
        print("neg_samples shape ",neg_samples.shape)
        print("inputs1 shape ",inputs1.shape)
        print("neg_aff shape ",neg_aff.shape)
        
        return neg_aff

    def loss(self, inputs1, inputs2, neg_samples):
        """ negative sampling loss.
        Args:
            neg_samples: tensor of shape [num_neg_samples x input_dim2]. Negative samples for all
            inputs in batch inputs1.
        """
        return self.loss_fn(inputs1, inputs2, neg_samples)

    def _xent_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None):
        aff = self.affinity(inputs1, inputs2)
        
        neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples)
        
        true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(aff), logits=aff)
        negative_xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.zeros_like(neg_aff), logits=neg_aff)
        
        loss = tf.reduce_sum(true_xent) + self.neg_sample_weights * tf.reduce_sum(negative_xent)
        return loss

    def _skipgram_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None):
        aff = self.affinity(inputs1, inputs2)
        neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples)
        neg_cost = tf.log(tf.reduce_sum(tf.exp(neg_aff), axis=1))
        loss = tf.reduce_sum(aff - neg_cost)
        return loss

    def _hinge_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None):
        aff = self.affinity(inputs1, inputs2)
        neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples)
        diff = tf.nn.relu(tf.subtract(neg_aff, tf.expand_dims(aff, 1) - self.margin), name='diff')
        loss = tf.reduce_sum(diff)
        self.neg_shape = tf.shape(neg_aff)
        return loss

    def weights_norm(self):
        return tf.nn.l2_norm(self.vars['weights'])
    
    
class UniformNeighborSampler(Layer):
    """
    Uniformly samples neighbors.
    Assumes that adj lists are padded with random re-sampling
    """
    def __init__(self, adj_info, **kwargs):
        super(UniformNeighborSampler, self).__init__(**kwargs)
        self.adj_info = adj_info

    def _call(self, inputs):
        ids, num_samples = inputs
        adj_lists = tf.nn.embedding_lookup(self.adj_info, ids) 
        adj_lists = tf.transpose(tf.random_shuffle(tf.transpose(adj_lists)))
        adj_lists = tf.slice(adj_lists, [0,0], [-1, num_samples])
        return adj_lists

    
class GCNAggregator(Layer):
    """
    Aggregates via mean followed by matmul and non-linearity.
    Same matmul parameters are used self vector and neighbor vectors.
    """

    def __init__(self, input_dim, output_dim, neigh_input_dim=None,
            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
        super(GCNAggregator, self).__init__(**kwargs)

        self.dropout = dropout
        self.bias = bias
        self.act = act
        self.concat = concat

        if neigh_input_dim is None:
            neigh_input_dim = input_dim

        if name is not None:
            name = '/' + name
        else:
            name = ''

        with tf.variable_scope(self.name + name + '_vars'):
            #scope.reuse_variables()  
            
            self.vars['weights'] = glorot([neigh_input_dim, output_dim],
                                                        name='neigh_weights')
            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if self.logging:
            self._log_vars()

        self.input_dim = input_dim
        self.output_dim = output_dim

    def _call(self, inputs):
        self_vecs, neigh_vecs = inputs

        neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout)
        self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout)
        means = tf.reduce_mean(tf.concat([neigh_vecs, 
            tf.expand_dims(self_vecs, axis=1)], axis=1), axis=1)
       
        # [nodes] x [out_dim]
        output = tf.matmul(means, self.vars['weights'])

        # bias
        if self.bias:
            output += self.vars['bias']
       
        return self.act(output)    
class MeanPoolingAggregator(Layer):
    """ Aggregates via mean-pooling over MLP functions.
    """
    def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None,
            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
        super(MeanPoolingAggregator, self).__init__(**kwargs)

        self.dropout = dropout
        self.bias = bias
        self.act = act
        self.concat = concat

        if neigh_input_dim is None:
            neigh_input_dim = input_dim

        if name is not None:
            name = '/' + name
        else:
            name = ''

        if model_size == "small":
            hidden_dim = self.hidden_dim = 512
        elif model_size == "big":
            hidden_dim = self.hidden_dim = 1024

        self.mlp_layers = []
        self.mlp_layers.append(Dense(input_dim=neigh_input_dim,
                                 output_dim=hidden_dim,
                                 act=tf.nn.relu,
                                 dropout=dropout,
                                 sparse_inputs=False,
                                 logging=self.logging))

        with tf.variable_scope(self.name + name + '_vars'):
            self.vars['neigh_weights'] = glorot([hidden_dim, output_dim],
                                                        name='neigh_weights')
           
            self.vars['self_weights'] = glorot([input_dim, output_dim],
                                                        name='self_weights')
            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if self.logging:
            self._log_vars()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.neigh_input_dim = neigh_input_dim

    def _call(self, inputs):
        self_vecs, neigh_vecs = inputs
        neigh_h = neigh_vecs

        dims = tf.shape(neigh_h)
        batch_size = dims[0]
        num_neighbors = dims[1]
        # [nodes * sampled neighbors] x [hidden_dim]
        h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim))

        for l in self.mlp_layers:
            h_reshaped = l(h_reshaped)
        neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim))
        neigh_h = tf.reduce_mean(neigh_h, axis=1)
        
        from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights'])
        from_self = tf.matmul(self_vecs, self.vars["self_weights"])
        
        if not self.concat:
            output = tf.add_n([from_self, from_neighs])
        else:
            output = tf.concat([from_self, from_neighs], axis=1)

        # bias
        if self.bias:
            output += self.vars['bias']
       
        return self.act(output)
    
class MeanAggregator(Layer):
    """
    Aggregates via mean followed by matmul and non-linearity.
    """

    def __init__(self, input_dim, output_dim, neigh_input_dim=None,
            dropout=0., bias=False, act=tf.nn.relu, 
            name=None, concat=False, **kwargs):
        super(MeanAggregator, self).__init__(**kwargs)

        self.dropout = dropout
        self.bias = bias
        self.act = act
        self.concat = concat

        if neigh_input_dim is None:
            neigh_input_dim = input_dim

        if name is not None:
            name = '/' + name
        else:
            name = ''

        with tf.variable_scope(self.name + name + '_vars'):
            self.vars['neigh_weights'] = glorot([neigh_input_dim, output_dim],
                                                        name='neigh_weights')
            self.vars['self_weights'] = glorot([input_dim, output_dim],
                                                        name='self_weights')
            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if self.logging:
            self._log_vars()

        self.input_dim = input_dim
        self.output_dim = output_dim

    def _call(self, inputs):
        self_vecs, neigh_vecs = inputs

        neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout)
        self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout)
        neigh_means = tf.reduce_mean(neigh_vecs, axis=1)
       
        # [nodes] x [out_dim]
        from_neighs = tf.matmul(neigh_means, self.vars['neigh_weights'])

        from_self = tf.matmul(self_vecs, self.vars["self_weights"])
         
        if not self.concat:
            output = tf.add_n([from_self, from_neighs])
        else:
            output = tf.concat([from_self, from_neighs], axis=1)

        # bias
        if self.bias:
            output += self.vars['bias']
       
        return self.act(output)

class SeqAggregator(Layer):
    """ Aggregates via a standard LSTM.
    """
    def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None,
            dropout=0., bias=False, act=tf.nn.relu, name=None,  concat=False, **kwargs):
        super(SeqAggregator, self).__init__(**kwargs)

        self.dropout = dropout
        self.bias = bias
        self.act = act
        self.concat = concat

        if neigh_input_dim is None:
            neigh_input_dim = input_dim

        if name is not None:
            name = '/' + name
        else:
            name = ''

        if model_size == "small":
            hidden_dim = self.hidden_dim = 128
        elif model_size == "big":
            hidden_dim = self.hidden_dim = 256

        with tf.variable_scope(self.name + name + '_vars'):
            self.vars['neigh_weights'] = glorot([hidden_dim, output_dim],
                                                        name='neigh_weights')
           
            self.vars['self_weights'] = glorot([input_dim, output_dim],
                                                        name='self_weights')
            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if self.logging:
            self._log_vars()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.neigh_input_dim = neigh_input_dim
        self.cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)

    def _call(self, inputs):
        self_vecs, neigh_vecs = inputs

        dims = tf.shape(neigh_vecs)
        batch_size = dims[0]
        initial_state = self.cell.zero_state(batch_size, tf.float32)
        used = tf.sign(tf.reduce_max(tf.abs(neigh_vecs), axis=2))
        length = tf.reduce_sum(used, axis=1)
        length = tf.maximum(length, tf.constant(1.))
        length = tf.cast(length, tf.int32)

        with tf.variable_scope(self.name) as scope:
            try:
                rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                        self.cell, neigh_vecs,
                        initial_state=initial_state, dtype=tf.float32, time_major=False,
                        sequence_length=length)
            except ValueError:
                scope.reuse_variables()
                rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                        self.cell, neigh_vecs,
                        initial_state=initial_state, dtype=tf.float32, time_major=False,
                        sequence_length=length)
        batch_size = tf.shape(rnn_outputs)[0]
        max_len = tf.shape(rnn_outputs)[1]
        out_size = int(rnn_outputs.get_shape()[2])
        index = tf.range(0, batch_size) * max_len + (length - 1)
        flat = tf.reshape(rnn_outputs, [-1, out_size])
        neigh_h = tf.gather(flat, index)

        from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights'])
        from_self = tf.matmul(self_vecs, self.vars["self_weights"])
         
        output = tf.add_n([from_self, from_neighs])

        if not self.concat:
            output = tf.add_n([from_self, from_neighs])
        else:
            output = tf.concat([from_self, from_neighs], axis=1)

        # bias
        if self.bias:
            output += self.vars['bias']
       
        return self.act(output)
    
class MaxPoolingAggregator(Layer):
    """ Aggregates via max-pooling over MLP functions.
    """
    def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None,
            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
        super(MaxPoolingAggregator, self).__init__(**kwargs)

        self.dropout = dropout
        self.bias = bias
        self.act = act
        self.concat = concat

        if neigh_input_dim is None:
            neigh_input_dim = input_dim

        if name is not None:
            name = '/' + name
        else:
            name = ''

        if model_size == "small":
            hidden_dim = self.hidden_dim = 512
        elif model_size == "big":
            hidden_dim = self.hidden_dim = 1024

        self.mlp_layers = []
        self.mlp_layers.append(Dense(input_dim=neigh_input_dim,
                                 output_dim=hidden_dim,
                                 act=tf.nn.relu,
                                 dropout=dropout,
                                 sparse_inputs=False,
                                 logging=self.logging))

        with tf.variable_scope(self.name + name + '_vars'):
            self.vars['neigh_weights'] = glorot([hidden_dim, output_dim],
                                                        name='neigh_weights')
           
            self.vars['self_weights'] = glorot([input_dim, output_dim],
                                                        name='self_weights')
            if self.bias:
                self.vars['bias'] = zeros([self.output_dim], name='bias')

        if self.logging:
            self._log_vars()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.neigh_input_dim = neigh_input_dim

    def _call(self, inputs):
        self_vecs, neigh_vecs = inputs
        neigh_h = neigh_vecs

        dims = tf.shape(neigh_h)
        batch_size = dims[0]
        num_neighbors = dims[1]
        # [nodes * sampled neighbors] x [hidden_dim]
        h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim))

        for l in self.mlp_layers:
            h_reshaped = l(h_reshaped)
        neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim))
        neigh_h = tf.reduce_max(neigh_h, axis=1)
        
        from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights'])
        from_self = tf.matmul(self_vecs, self.vars["self_weights"])
        
        if not self.concat:
            output = tf.add_n([from_self, from_neighs])
        else:
            output = tf.concat([from_self, from_neighs], axis=1)

        # bias
        if self.bias:
            output += self.vars['bias']
       
        return self.act(output)
    
class SampleAndAggregate(GeneralizedModel):
    """
    Base implementation of unsupervised GraphSAGE
    """

    def __init__(self, placeholders, features, adj, degrees,
            layer_infos, concat=True, aggregator_type="mean", 
            model_size="small", identity_dim=0,
            **kwargs):
        '''
        Args:
            - placeholders: Stanford TensorFlow placeholder object.
            - features: Numpy array with node features. 
                        NOTE: Pass a None object to train in featureless mode (identity features for nodes)!
            - adj: Numpy array with adjacency lists (padded with random re-samples)
            - degrees: Numpy array with node degrees. 
            - layer_infos: List of SAGEInfo namedtuples that describe the parameters of all 
                   the recursive layers. See SAGEInfo definition above.
            - concat: whether to concatenate during recursive iterations
            - aggregator_type: how to aggregate neighbor information
            - model_size: one of "small" and "big"
            - identity_dim: Set to positive int to use identity features (slow and cannot generalize, but better accuracy)
        '''
        super(SampleAndAggregate, self).__init__(**kwargs)
        if aggregator_type == "mean":
            self.aggregator_cls = MeanAggregator
        elif aggregator_type == "seq":
            self.aggregator_cls = SeqAggregator
        elif aggregator_type == "maxpool":
            self.aggregator_cls = MaxPoolingAggregator
        elif aggregator_type == "meanpool":
            self.aggregator_cls = MeanPoolingAggregator
        elif aggregator_type == "gcn":
            self.aggregator_cls = GCNAggregator
        else:
            raise Exception("Unknown aggregator: ", self.aggregator_cls)

        # get info from placeholders...
        self.inputs1 = placeholders["batch1"]
        self.inputs2 = placeholders["batch2"]
        self.neg_samples = placeholders["neg_samples"]
        self.neg_sample_size = placeholders["neg_sample_size"]
        
        self.model_size = model_size
        self.adj_info = adj
        #self.sample_table = sample_table
        
        if identity_dim > 0:
            #print("tf.global_variables() ",tf.global_variables())
            tmp = [v for v in tf.global_variables() if v.name == "node_embeddings:0"]
            print("tmp ",tmp)
            print("adj shape ",adj.get_shape().as_list()[0])
            if tmp:
                self.embeds = tmp[0]
            else:
                print("tmp is None ")
                #self.embeds = tf.get_variable("node_embeddings", [adj.get_shape().as_list()[0], identity_dim])
                print("(adj.get_shape().as_list()[0] ",(adj.get_shape().as_list()[0]))
                self.embeds = tf.get_variable("node_embeddings", initializer= tf.eye(adj.get_shape().as_list()[0], num_columns = identity_dim, dtype=tf.float32) )
                #self.embeds = tf.get_variable("node_embeddings", initializer= tf.ones((adj.get_shape().as_list()[0], identity_dim), dtype=tf.float32) )
                #self.embeds = tf.get_variable("node_embeddings", initializer= tf.eye(adj.get_shape().as_list()[0], dtype=tf.float32) )
                #self.embeds = adj.get_shape().as_list()[0]
                #tf.Variable(initial_value = np.identity(784))
                
        else:
           self.embeds = None
        if features is None: 
            if identity_dim == 0:
                raise Exception("Must have a positive value for identity feature dimension if no input features given.")
            self.features = self.embeds
            print("self.features ",self.features)
        else:
            self.features = tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False)
            if not self.embeds is None:
                self.features = tf.concat([self.embeds, self.features], axis=1)
        self.degrees = degrees
        self.concat = concat

        self.dims = [(0 if features is None else features.shape[1]) + identity_dim]
        self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))])
        #self.neg_sample_size = placeholders["neg_sample_size"]
        self.batch_size = placeholders["batch_size"]
        self.placeholders = placeholders
        self.layer_infos = layer_infos
        
        #self.neg_sample_size = 20

        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)

        self.build()

    def sample(self, inputs, layer_infos, batch_size=None):
        """ Sample neighbors to be the supportive fields for multi-layer convolutions.
        Args:
            inputs: batch inputs
            batch_size: the number of inputs (different for batch inputs and negative samples).
        """
        
        if batch_size is None:
            batch_size = self.batch_size
        samples = [inputs]
        # size of convolution support at each layer per node
        support_size = 1
        support_sizes = [support_size]
        for k in range(len(layer_infos)):
            t = len(layer_infos) - k - 1
            support_size *= layer_infos[t].num_samples
            sampler = layer_infos[t].neigh_sampler
            node = sampler((samples[k], layer_infos[t].num_samples))
            samples.append(tf.reshape(node, [support_size * batch_size,]))
            support_sizes.append(support_size)
        return samples, support_sizes


    def aggregate(self, samples, input_features, dims, num_samples, support_sizes, batch_size=None,
            aggregators=None, name=None, concat=False, model_size="small"):
        """ At each layer, aggregate hidden representations of neighbors to compute the hidden representations 
            at next layer.
        Args:
            samples: a list of samples of variable hops away for convolving at each layer of the
                network. Length is the number of layers + 1. Each is a vector of node indices.
            input_features: the input features for each sample of various hops away.
            dims: a list of dimensions of the hidden representations from the input layer to the
                final layer. Length is the number of layers + 1.
            num_samples: list of number of samples for each layer.
            support_sizes: the number of nodes to gather information from for each layer.
            batch_size: the number of inputs (different for batch inputs and negative samples).
        Returns:
            The hidden representation at the final layer for all nodes in batch
        """

        if batch_size is None:
            batch_size = self.batch_size

        # length: number of layers + 1
        hidden = [tf.nn.embedding_lookup(input_features, node_samples) for node_samples in samples]
        
#         tmp_array_dim = input_features[0]
#         tmp_array_row = len(samples)
#         print("samples len: ", len(samples))
#         print("samples : ", samples)
        
#         hidden = np.zeros((tmp_array_row,tmp_array_dim))
#         for i, node_samples in enumerate(samples):
#             hidden[i,node_samples] = 1
        #print(hidden)
        
        
        new_agg = aggregators is None
        if new_agg:
            aggregators = []
        for layer in range(len(num_samples)):
            if new_agg:
                dim_mult = 2 if concat and (layer != 0) else 1
                # aggregator at current layer
                if layer == len(num_samples) - 1:
                    aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], act=lambda x : x,
                            dropout=self.placeholders['dropout'], 
                            name=name, concat=concat, model_size=model_size)
                else:
                    aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1],
                            dropout=self.placeholders['dropout'], 
                            name=name, concat=concat, model_size=model_size)
                aggregators.append(aggregator)
            else:
                aggregator = aggregators[layer]
            # hidden representation at current layer for all support nodes that are various hops away
            next_hidden = []
            # as layer increases, the number of support nodes needed decreases
            for hop in range(len(num_samples) - layer):
                dim_mult = 2 if concat and (layer != 0) else 1
                neigh_dims = [batch_size * support_sizes[hop], 
                              num_samples[len(num_samples) - hop - 1], 
                              dim_mult*dims[layer]]
                h = aggregator((hidden[hop],
                                tf.reshape(hidden[hop + 1], neigh_dims)))
                next_hidden.append(h)
            hidden = next_hidden
        return hidden[0], aggregators

    def _build(self):
        
#         labels = tf.reshape(
#                 tf.cast(self.placeholders['batch2'], dtype=tf.int64),
#                 [self.batch_size, 1])
        
#         self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
#             true_classes=labels,
#             num_true=1,
#             num_sampled=self.neg_sample_size,
#             unique=False,
#             range_max=len(self.degrees),
#             distortion=0.75,
#             unigrams=self.degrees.tolist()))
    
      
        
#         self.neg_samples = []
#         elems = (self.inputs1, self.inputs2)
        
#         tf.map_fn(lambda x: x[0] * x[1], elems)
            
            
#         for i,j in zip(labels, labels):
#             tmp_sample = self.sample_table(1)[0]
#             while(tmp_sample == i or tmp_sample == j):
#                 tmp_sample = self.sample_table(1)[0]
#             self.neg_samples.append(tmp_sample)
           
        # perform "convolution"
        samples1, support_sizes1 = self.sample(self.inputs1, self.layer_infos)
        samples2, support_sizes2 = self.sample(self.inputs2, self.layer_infos)
        num_samples = [layer_info.num_samples for layer_info in self.layer_infos]
        self.outputs1, self.aggregators = self.aggregate(samples1, [self.features], self.dims, num_samples,
                support_sizes1, concat=self.concat, model_size=self.model_size)
        self.outputs2, _ = self.aggregate(samples2, [self.features], self.dims, num_samples,
                support_sizes2, aggregators=self.aggregators, concat=self.concat,
                model_size=self.model_size)

        neg_samples, neg_support_sizes = self.sample(self.neg_samples, self.layer_infos,
            (self.batch_size*self.neg_sample_size))
        self.neg_outputs, _ = self.aggregate(neg_samples, [self.features], self.dims, num_samples,
                neg_support_sizes, batch_size=(self.batch_size*self.neg_sample_size), aggregators=self.aggregators,
                concat=self.concat, model_size=self.model_size)

        dim_mult = 2 if self.concat else 1
        self.link_pred_layer = BipartiteEdgePredLayer(dim_mult*self.dims[-1],
                dim_mult*self.dims[-1], self.placeholders, act=tf.nn.sigmoid, 
                bilinear_weights=False,
                name='edge_predict')

        self.outputs1 = tf.nn.l2_normalize(self.outputs1, 1)
        self.outputs2 = tf.nn.l2_normalize(self.outputs2, 1)
        self.neg_outputs = tf.nn.l2_normalize(self.neg_outputs, 1)

    def build(self):
        self._build()

        # TF graph management
        self._loss()
        self._accuracy()
        self.loss = self.loss / tf.cast(self.batch_size, tf.float32)
        grads_and_vars = self.optimizer.compute_gradients(self.loss)
        clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var) 
                for grad, var in grads_and_vars]
        self.grad, _ = clipped_grads_and_vars[0]
        self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars)

    def _loss(self):
        for aggregator in self.aggregators:
            for var in aggregator.vars.values():
                self.loss += weight_decay * tf.nn.l2_loss(var)

        self.loss += self.link_pred_layer.loss(self.outputs1, self.outputs2, self.neg_outputs) 
        tf.summary.scalar('loss', self.loss)

    def _accuracy(self):
        # shape: [batch_size]
#         aff = self.link_pred_layer.affinity(self.outputs1, self.outputs2)
        
#         # shape : [batch_size x num_neg_samples]
#         self.neg_aff = self.link_pred_layer.neg_cost(self.outputs1, self.neg_outputs)
#         self.neg_aff = tf.reshape(self.neg_aff, [self.batch_size, self.neg_sample_size])
        
#         _aff = tf.expand_dims(aff, axis=1)
#         self.aff_all = tf.concat(axis=1, values=[self.neg_aff, _aff])
#         size = tf.shape(self.aff_all)[1]
#         _, indices_of_ranks = tf.nn.top_k(self.aff_all, k=size)
#         _, self.ranks = tf.nn.top_k(-indices_of_ranks, k=size)
#         self.mrr = tf.reduce_mean(tf.div(1.0, tf.cast(self.ranks[:, -1] + 1, tf.float32)))
#         tf.summary.scalar('mrr', self.mrr)
        self.mrr = 0.0
        tf.summary.scalar('mrr', self.mrr)

In [8]:
t1 = time.time()
g = Graph()
print("Reading...")



a_input = './../Tweet_edgelist.txt'

a_weighted = False
directed = False
g.read_edgelist(filename=a_input, weighted=a_weighted,
                        directed=directed)



feature_file = './../Tweet_featurelist.txt'
g.read_node_features(feature_file)

dropout = 0.5
weight_decay = 5e-4
hidden = 16
epochs = 200

Reading...


In [9]:
context_pairs = list(g.G.edges())

def construct_placeholders():
    # Define placeholders
    placeholders = {
        'batch1' : tf.placeholder(tf.int32, shape=(None), name='batch1'),
        'batch2' : tf.placeholder(tf.int32, shape=(None), name='batch2'),
        # negative samples for all nodes in the batch
        'neg_samples': tf.placeholder(tf.int32, shape=(None), name='neg_samples'),
        'neg_sample_size': tf.placeholder(tf.int32, name='neg_sample_size'),
        'dropout': tf.placeholder_with_default(0., shape=(), name='dropout'),
        'batch_size' : tf.placeholder(tf.int32, name='batch_size'),
    }
    return placeholders

placeholders = construct_placeholders()
max_degree = 100
adj = len(g.look_up_dict)*np.ones((len(g.look_up_dict)+1, max_degree))
deg = np.zeros((len(g.look_up_dict),))
print("adj.shape : ",adj.shape)
print("deg.shape : ",deg.shape)

adj.shape :  (5099, 100)
deg.shape :  (5098,)


In [10]:
for nodeid in g.G.nodes():
    neighbors = np.array([g.look_up_dict[neighbor] for neighbor in g.G.neighbors(nodeid)])
    deg[g.look_up_dict[nodeid]] = len(neighbors)
    if len(neighbors) == 0:
        continue
    if len(neighbors) > max_degree:
        neighbors = np.random.choice(neighbors, max_degree, replace=False)
    elif len(neighbors) < max_degree:
        neighbors = np.random.choice(neighbors, max_degree, replace=True)
        
    adj[g.look_up_dict[nodeid], :] = neighbors    

In [11]:
adj_info_ph = tf.placeholder(tf.int32, shape=adj.shape)
adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

SAGEInfo = namedtuple("SAGEInfo",
    ['layer_name', # name of the layer (to get feature embedding etc.)
     'neigh_sampler', # callable neigh_sampler constructor
     'num_samples',
     'output_dim' # the output (i.e., hidden) dimension
    ])

samples_1 = 25
samples_2 = 10

#samples_1 = 10
#samples_2 = 5


dim_1 = 128
dim_2 = 128

sampler = UniformNeighborSampler(adj_info)
layer_infos = [SAGEInfo("node", sampler, samples_1, dim_1),
            SAGEInfo("node", sampler, samples_2, dim_2)]

#features = None
features = list(nx.get_node_attributes(g.G,'feature').values())
features = np.asarray(features)
features = np.vstack([features, np.zeros((features.shape[1],))])

model_size = "small"
identity_dim = adj_info.get_shape().as_list()[0]

In [12]:
identity_dim = 0

In [13]:
model = SampleAndAggregate(placeholders, 
                             features,
                             adj_info,
                             deg,
                             layer_infos=layer_infos, 
                             aggregator_type="gcn",
                             model_size=model_size,
                             identity_dim = identity_dim,
                             concat=False,
                             logging=True)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
self.neg_sample_size  Tensor("neg_sample_size:0", dtype=int32)
self.batch_size  Tensor("batch_size:0", dtype=int32)
loss_fn  xent
orginal neg_samples shape  (?, 128)
reshape1 neg_samples shape  (?, ?, 128)
reshape2 neg_samples shape  (?, 128, ?)
orginal inputs1 shape  (?, 128)
temp_neg_aff shape  (?, 1, ?)
neg_samples shape  (?, 128, ?)
inputs1 shape  (?, 1, 128)
neg_aff shape  (?, ?)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [14]:
table = UnigramTable(deg)

Filling unigram table
Finish filling unigram table


In [15]:
features.shape

(5099, 89)

In [16]:
config = tf.ConfigProto(log_device_placement=False)
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)

sess.run(tf.global_variables_initializer(),
         feed_dict={adj_info_ph: adj})

In [17]:
total_steps = 0
epochs = 20

batch_size= 200
avg_time = 0.0
each_node_neg_samples = 5
total_len = len(context_pairs)

for epoch in range(epochs): 
    train_edges = edges = np.random.permutation(context_pairs)
    batch_num = 0
    iter = 0
    print('Epoch: %04d' % (epoch + 1))

    while not (batch_num * batch_size >= len(train_edges)):

        start_idx = batch_num * batch_size
        batch_num += 1
        end_idx = min(start_idx + batch_size, len(train_edges))
        batch_nodes = train_edges[start_idx : end_idx]

        batch1 = []
        batch2 = []
        neg_samples = []
        
        for node1, node2 in batch_nodes:
            
            seq_node1 = g.look_up_dict[node1]
            seq_node2 = g.look_up_dict[node2]
            
            batch1.append(seq_node1)
            batch2.append(seq_node2)
            
            
            for i in np.arange(each_node_neg_samples):
                tmp_sample = table.sample(1)[0]
                while tmp_sample == seq_node1 or tmp_sample == seq_node2:
                    tmp_sample = table.sample(1)[0]
                neg_samples.append(tmp_sample)

        feed_dict = dict()
        feed_dict.update({placeholders['batch_size'] : len(batch_nodes)})
        feed_dict.update({placeholders['batch1']: batch1})
        feed_dict.update({placeholders['batch2']: batch2})
        feed_dict.update({placeholders['neg_samples']: neg_samples})
        feed_dict.update({placeholders['neg_sample_size']: each_node_neg_samples})
        
        #print(feed_dict)
        #print()
        
        feed_dict.update({placeholders['dropout']: dropout})

        t = time.time()
        outs = sess.run([model.opt_op, model.loss,
                         model.outputs1, model.outputs2, model.neg_outputs], feed_dict=feed_dict)   

        train_cost = outs[1]
        #train_mrr = outs[2]

        avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)
        
        if iter%50 == 0:
            print("Iter:", '%04d' % iter, 
                              "pairs:", "{:.2f}".format(iter*batch_size/total_len*100), 
                              "train_loss=", "{:.5f}".format(train_cost),
                              "time=", "{:.5f}".format(avg_time))

        iter += 1
        total_steps += 1
        

Epoch: 0001
Iter: 0000 pairs: 0.00 train_loss= 5.52921 time= 1.48445
Iter: 0050 pairs: 9.01 train_loss= 5.46609 time= 0.04968
Iter: 0100 pairs: 18.01 train_loss= 5.40103 time= 0.03554
Iter: 0150 pairs: 27.02 train_loss= 5.29486 time= 0.03072
Iter: 0200 pairs: 36.03 train_loss= 5.18134 time= 0.02825
Iter: 0250 pairs: 45.03 train_loss= 5.09701 time= 0.02679
Iter: 0300 pairs: 54.04 train_loss= 5.00796 time= 0.02583
Iter: 0350 pairs: 63.05 train_loss= 4.84629 time= 0.02516
Iter: 0400 pairs: 72.05 train_loss= 4.79254 time= 0.02462
Iter: 0450 pairs: 81.06 train_loss= 4.68321 time= 0.02423
Iter: 0500 pairs: 90.07 train_loss= 4.61659 time= 0.02393
Iter: 0550 pairs: 99.07 train_loss= 4.54174 time= 0.02368
Epoch: 0002
Iter: 0000 pairs: 0.00 train_loss= 4.56243 time= 0.02363
Iter: 0050 pairs: 9.01 train_loss= 4.47617 time= 0.02341
Iter: 0100 pairs: 18.01 train_loss= 4.40499 time= 0.02324
Iter: 0150 pairs: 27.02 train_loss= 4.35014 time= 0.02308
Iter: 0200 pairs: 36.03 train_loss= 4.30525 time= 0.

Iter: 0400 pairs: 72.05 train_loss= 4.03508 time= 0.02112
Iter: 0450 pairs: 81.06 train_loss= 4.03134 time= 0.02111
Iter: 0500 pairs: 90.07 train_loss= 4.01745 time= 0.02111
Iter: 0550 pairs: 99.07 train_loss= 4.05952 time= 0.02111
Epoch: 0013
Iter: 0000 pairs: 0.00 train_loss= 4.02696 time= 0.02111
Iter: 0050 pairs: 9.01 train_loss= 4.05854 time= 0.02111
Iter: 0100 pairs: 18.01 train_loss= 4.06707 time= 0.02111
Iter: 0150 pairs: 27.02 train_loss= 4.01103 time= 0.02110
Iter: 0200 pairs: 36.03 train_loss= 4.08762 time= 0.02110
Iter: 0250 pairs: 45.03 train_loss= 4.04382 time= 0.02110
Iter: 0300 pairs: 54.04 train_loss= 4.03828 time= 0.02110
Iter: 0350 pairs: 63.05 train_loss= 4.03300 time= 0.02110
Iter: 0400 pairs: 72.05 train_loss= 4.05456 time= 0.02110
Iter: 0450 pairs: 81.06 train_loss= 4.03315 time= 0.02110
Iter: 0500 pairs: 90.07 train_loss= 4.03373 time= 0.02110
Iter: 0550 pairs: 99.07 train_loss= 4.04827 time= 0.02109
Epoch: 0014
Iter: 0000 pairs: 0.00 train_loss= 4.03530 time= 0

In [18]:
def acc(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    from sklearn.utils.linear_assignment_ import linear_assignment
    ind = linear_assignment(w.max() - w)
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size

In [19]:
val_embeddings = []
finished = False
seen = set([])
nodes = []
iter_num = 0
name = "val"
size = batch_size
node_list = list(g.G.nodes())
re_order_vectors = np.zeros((len(node_list),dim_1))

while not finished:
    
    val_nodes = node_list[iter_num*size:min((iter_num+1)*size, len(node_list))]
    val_edges = [(n,n) for n in val_nodes]

    batch1 = []
    batch2 = []
    neg_samples = []
    
    for node1, node2 in val_edges:
        seq_node1 = g.look_up_dict[node1]
        seq_node2 = g.look_up_dict[node2]

        batch1.append(seq_node1)
        batch2.append(seq_node2)
        tmp_sample = table.sample(1)[0]
        while tmp_sample == seq_node1 or tmp_sample == seq_node2:
            tmp_sample = table.sample(1)[0]
        neg_samples.append(tmp_sample)
     
    feed_dict = dict()
    feed_dict.update({placeholders['batch_size'] : len(val_edges)})
    feed_dict.update({placeholders['batch1']: batch1})
    feed_dict.update({placeholders['batch2']: batch2})
    
    
    feed_dict.update({placeholders['neg_samples']: neg_samples})
    feed_dict.update({placeholders['neg_sample_size']: 1})
        
    feed_dict.update({placeholders['dropout']: dropout})

    feed_dict_val, finished, edges = feed_dict, (iter_num+1)*size >= len(node_list), val_nodes


    iter_num += 1
    outs_val = sess.run([model.loss, model.outputs1], 
                        feed_dict=feed_dict_val)
    

    
    for i, edge in enumerate(edges):
        #print(type(edge))  
        if not edge in seen:
            val_embeddings.append(outs_val[-1][i,:])
            nodes.append(edge)
            seen.add(edge)
            re_order_vectors[int(edge)] = outs_val[-1][i,:]
            
val_embeddings = np.vstack(val_embeddings)

In [20]:
filename = "./../Tweet_dictionary.txt"
word_id = {}
with open(filename,'r')as datafile:
    sents = datafile.readlines()
    for data in sents:
        word_id[data.split(" ")[0]] = int(data.split(" ")[1].strip())
datafile.close()

import numpy as np
total_doc_num = 2472
doc_emb = np.zeros((total_doc_num,dim_1))
doc_label = np.zeros((total_doc_num,),dtype=int)
import re
filename = "./../data/Tweet"
doc_num = 0
with open(filename,'r')as datafile:
        sents = datafile.readlines()
        punc = '",:'
        for data in sents:
            
            data = re.sub(r'[{}]+'.format(punc),'',data)
            raw_text = data.split(' ')[1:-2]
            
            doc_label[doc_num] = int(data.split(' ')[-1].strip().replace("}",""))
            for data_i in raw_text:
                doc_emb[doc_num] += re_order_vectors[word_id[data_i]]
            doc_emb[doc_num] /= len(raw_text)  
            #doc_emb[doc_num]
            doc_num += 1
datafile.close()            

In [21]:
from sklearn.cluster import KMeans

In [22]:
kmeans = KMeans(n_clusters=89,  max_iter=100).fit(doc_emb)

In [23]:
print(kmeans.labels_.shape)
print(doc_label.shape)

(2472,)
(2472,)


In [24]:
from sklearn.metrics.cluster import normalized_mutual_info_score

In [25]:
print(normalized_mutual_info_score(np.array(doc_label), np.array(kmeans.labels_)))
print(acc(np.array(doc_label), np.array(kmeans.labels_)))

0.8198505232518324
0.5570388349514563


In [26]:
avg_nmi = []
avg_acc = []
for i in range(20):
    kmeans = KMeans(n_clusters=89,  max_iter=100).fit(doc_emb)
    avg_nmi.append(normalized_mutual_info_score(np.array(doc_label), np.array(kmeans.labels_)))
    avg_acc.append(acc(np.array(doc_label), np.array(kmeans.labels_)))
print("avg_nmi: ", np.mean(avg_nmi))
print("avg_acc: ",np.mean(avg_acc))

avg_nmi:  0.8150889401944876
avg_acc:  0.5509911003236246


In [27]:
# storefile = 'emb/Tweet_graphsage_emb.txt'
# sf = open(storefile,'w')
# for i in range(doc_emb.shape[0]):
#     sf.write(str(i)+" "+" ".join([str(ele) for ele in doc_emb[i]])+'\n')
# sf.close()