In [59]:
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import networkx as nx

In [60]:
def parse_index_file(filename):
    """Read a text file that holds one integer per line.

    :param filename: Path of the index file to read.
    :return: List of ints, one per line, in file order.
    :raises ValueError: If a line (after stripping whitespace) is not a valid
        integer literal; this includes blank lines.
    """
    # The context manager closes the handle as soon as reading is done instead
    # of leaving an open file behind for the garbage collector.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file]

In [61]:
def parse_index_file(filename):
    """Read a text file that holds one integer per line.

    NOTE(review): this cell redefines the ``parse_index_file`` from the
    previous cell with the same behavior; one of the two can be dropped.

    :param filename: Path of the index file to read.
    :return: List of ints, one per line, in file order.
    :raises ValueError: If a line (after stripping whitespace) is not a valid
        integer literal; this includes blank lines.
    """
    # Close the handle deterministically rather than relying on garbage
    # collection of the anonymous file object.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file]
def sample_mask(idx, l):
    """Build a boolean mask of length ``l`` that is True at the positions ``idx``.

    :param idx: Index or indices to switch on (anything NumPy accepts for
        indexing, e.g. a list, ``range`` or integer array).
    :param l: Length of the mask.
    :return: ``numpy.ndarray`` of dtype ``bool`` with ``mask[idx] == True``
        and False everywhere else.
    """
    # ``np.bool`` was a plain alias of the builtin and was removed in
    # NumPy 1.24; the builtin ``bool`` works on every NumPy version.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

In [83]:
def load_data(dataset_str):
    """
    Loads input data from the ``../data/<dataset_str>/`` directory.

    The directory must contain the following files (paths are relative to the
    current working directory):
    x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of x) as scipy.sparse.csr.csr_matrix object;
    y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ty => the one-hot labels of the test instances as numpy.ndarray object;
    ally => the labels for instances in allx as numpy.ndarray object;
    graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    test.index => the indices of test instances in graph, one integer per line (plain text).
    All objects above except test.index must be saved using python pickle module.

    Splits: the first ``len(y)`` rows are the training set, the following 500
    rows the validation set, and the rows listed in ``test.index`` the test set.

    :param dataset_str: Dataset name (sub-directory of ``../data``)
    :return: Tuple ``(adj, features, allx, ally, y_train, y_val, y_test, labels)``.
        ``adj`` is the adjacency matrix built from ``graph``; ``features`` and
        ``labels`` stack the ``all*`` and test arrays; ``y_train`` / ``y_val`` /
        ``y_test`` have the shape of ``labels`` and are zero outside their split.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i, name in enumerate(names):
        print(i)
        with open(f"../data/{dataset_str}/{name}", 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only point this
            # at dataset files from a trusted source.
            # This function already requires Python 3 (f-strings), so the old
            # ``sys.version_info`` switch was dead code and also relied on a
            # ``sys`` import that no earlier cell provides.
            objects.append(pkl.load(f, encoding='latin1'))

    x, y, tx, ty, allx, ally, graph = objects
    test_idx_reorder = parse_index_file(f"../data/{dataset_str}/test.index")
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'linqs_citeseer_planetoid':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack training and test rows, then copy the rows found at the sorted
    # test positions to the positions listed (in file order) in test.index.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    # Fixed-size validation split: the 500 rows right after the training rows.
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Per-split label matrices: copy the rows of the split, leave the rest zero.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
    return adj, features, allx, ally, y_train, y_val, y_test, labels

In [84]:
dataset_str = 'linqs_citeseer_planetoid'
adj, features, allx, ally, y_train, y_val, y_test, labels = load_data(dataset_str)

0
1
2
3
4
5
6


In [66]:
import os
data_path = 'linqs_cora_planetoid'
x_path = os.path.abspath(os.path.join( '..', f'data/{data_path}/features.csv'))

In [68]:
features = np.loadtxt(x_path, delimiter=',')

In [86]:
labels.shape

(3327, 6)

In [71]:
# np.savetxt(f"../data/{dataset_str}/features_raw.csv",features.toarray(),delimiter=',')

In [73]:
# with open(f"../data/linqs_pubmed_planetoid/ind.pubmed.y", 'rb') as f:
#     y = pkl.load(f, encoding='latin1')

In [74]:
# pkl.dump(y,open(f"../data/linqs_pubmed_planetoid/y", 'wb'))

In [75]:
graph = adj

In [77]:
# np.savetxt(f"../data/{dataset_str}/graph_symmetric.csv",np.vstack([graph.tocoo().row,graph.tocoo().col]).T,delimiter=',')

In [87]:
# Expose the loader outputs under the names the following cells work with.
true_labels, graph = labels, adj

# A row belongs to a split when its row sum in the matching y_* matrix is
# non-zero; "unlabeled" is every row whose y_train row sum leaves 1 - sum
# non-zero (i.e. the rows with an all-zero y_train row).
labeled_indices = np.nonzero(y_train.sum(axis=1))[0]
unlabeled_indices = np.nonzero(1 - y_train.sum(axis=1))[0]
validation_indices = np.nonzero(y_val.sum(axis=1))[0]
test_indices = np.nonzero(y_test.sum(axis=1))[0]

In [88]:
np.argmax(true_labels[unlabeled_indices],axis=1)

array([3, 1, 2, ..., 3, 1, 5])

In [43]:
len(unlabeled_indices),len(labeled_indices)

(2568, 140)

In [44]:
seed = 1
np.random.seed(seed)
num_nodes, num_classes = true_labels.shape
num_unlabeled = unlabeled_indices.shape[0]

In [45]:
# Guarantee one labeled node per class: draw a single random member of every
# class column (one np.random.choice call per class, in class order).
labeled_indices_from_class = [
    np.random.choice(np.where(true_labels[:, class_id])[0])
    for class_id in range(num_classes)
]

# Every node that was not reserved above is a candidate for unlabeling.
reserved_for_labeled = set(labeled_indices_from_class)
indices_left = [
    node for node in range(num_nodes) if node not in reserved_for_labeled
]

# Draw the unlabeled set without replacement and keep it in ascending order;
# whatever is not unlabeled stays labeled.
unlabeled_indices = np.array(sorted(
    np.random.choice(indices_left, num_unlabeled, replace=False)))
labeled_indices = np.delete(np.arange(num_nodes), unlabeled_indices)

In [52]:
# Re-derive the labeled / unlabeled split from the training label matrix:
# rows with a non-zero row sum are labeled, the remaining rows are unlabeled.
labeled_indices = np.where(np.sum(y_train,axis=1))[0]
unlabeled_indices = np.where(1-np.sum(y_train,axis=1))[0]

# Remember the current split sizes before the index arrays are overwritten.
num_validation, num_test = len(validation_indices), len(test_indices)

# replace=False draws distinct nodes. The default (sampling with replacement)
# could pick the same node several times, producing duplicates inside a split
# and overlap between the validation and test sets. NumPy raises ValueError
# if more nodes are requested than there are unlabeled nodes.
test_val_indices = np.random.choice(
    unlabeled_indices, num_validation + num_test, replace=False)
validation_indices = test_val_indices[:num_validation]
test_indices = test_val_indices[num_validation:]

In [1]:
from __future__ import print_function
from pathlib import Path
from random import shuffle
import random

import argparse
import copy
import numpy as np
import sys
import tensorflow as tf
import scipy.sparse as sp
import sys

import sys
sys.path.append("..") 
from deeplp.models.deeplp_att import DeepLP_ATT
from deeplp.models.deeplp_edge import DeepLP_Edge
from deeplp.models.deeplp_wrbf import DeepLP_WRBF
from deeplp.models.lp import LP
from deeplp.utils import (calc_masks, create_seed_features, load_data, load_and_prepare_planetoid_data,
                          num_layers_dict, prepare_data, random_unlabel)


  from ._conv import register_converters as _register_converters


In [2]:
true_labels, features, raw_features, graph, labeled_indices, unlabeled_indices, target_indices, gcc_indices, nogcc_indices = load_and_prepare_planetoid_data('linqs_citeseer_planetoid')

0
1
2
3
4
5
6
fixed seed


In [8]:
np.sum(features,axis=0)

array([          nan, 7146.26703733,  126.08768604, 4735.3328956 ,
       4622.26686951, 4392.64124216, 2527.26719364, 2527.26719364,
        554.57142857, 2494.        ,  804.        , 6152.07685339,
       6152.07685339, 6152.07685339,  710.523945  , 7664.28696897])

In [10]:
G = nx.from_scipy_sparse_matrix(graph)

In [11]:
import networkx as nx
max(nx.connected_component_subgraphs(G), key=len)

<networkx.classes.graph.Graph at 0x12bb7d898>

In [15]:
# Node ids of the largest connected component of G.
# nx.connected_components yields the node sets directly, so no subgraph has to
# be materialized; it also keeps working on NetworkX >= 2.4, where
# connected_component_subgraphs no longer exists.
gcc_indices = set(max(nx.connected_components(G), key=len))

In [17]:
num_nodes, num_classes = true_labels.shape

In [4]:
labels, is_labeled = calc_masks(true_labels, labeled_indices,
                                unlabeled_indices)

In [5]:
seed_features = create_seed_features(graph, labeled_indices, true_labels)

TypeError: only integer scalar arrays can be converted to a scalar index

In [32]:
1/np.inf

0.0

In [33]:
    import networkx as nx
    # NOTE(review): from_scipy_sparse_matrix was removed in NetworkX 3.0
    # (from_scipy_sparse_array replaces it); kept here for the NetworkX
    # version this notebook was run with.
    U = nx.from_scipy_sparse_matrix(graph)  # undirected
    B = U.to_directed()
    edges = np.array(B.edges())
    sources, sinks = edges[:, 0], edges[:, 1]

    # Node ids of the largest connected component, in ascending order.
    # The order matters: the path-length rows below are sorted by node id, and
    # later cells index with subU_nodes, so both must use the same ordering.
    # connected_components is used instead of connected_component_subgraphs,
    # which no longer exists in NetworkX >= 2.4.
    subU_nodes = sorted(max(nx.connected_components(U), key=len))
    subU_node_set = set(subU_nodes)
    subU = U.subgraph(subU_nodes)
    # True for directed edges whose two endpoints both lie in the component.
    in_subU = (np.in1d(sources, subU_nodes)) & (np.in1d(sinks, subU_nodes))
    num_connected = len(subU_nodes)

    # calculate shortest path length to each seed node
    # Row k belongs to labeled_indices[k]; column j belongs to subU_nodes[j].
    seed_to_node_lengths = []  # num_labeled * num_connected matrix
    for i in labeled_indices:
        if int(i) in subU_node_set:
            # B holds both directions of every undirected edge, so the nodes
            # reachable from a seed inside the component are exactly the
            # component's nodes.
            shortest_paths_seed = nx.shortest_path_length(B, source=int(i))
            path_lengths = [
                length for _, length in sorted(shortest_paths_seed.items())
            ]
        else:
            # Seed outside the largest component: it reaches none of its nodes.
            path_lengths = [np.inf] * num_connected
        seed_to_node_lengths.append(path_lengths)
    seed_to_node_lengths = np.array(seed_to_node_lengths)

    # create label => list of seed indices dict
    # (the stored values are row positions into seed_to_node_lengths)
    labels_for_seeds = np.argmax(true_labels[labeled_indices], axis=1)
    labels_for_seeds_dict = {}
    for i, label in enumerate(labels_for_seeds):
        labels_for_seeds_dict.setdefault(label, []).append(i)

    # for each label, find the closest (or average) distance to
    # seed with that label
    seed_features = []

In [42]:
inv_seed_to_node_lengths = 1 / (seed_to_node_lengths + 1)

In [43]:
# For every class label, turn the inverse seed distances into two per-edge
# feature vectors (max and mean over the seeds of that label) and append both
# to seed_features, max first.
# NOTE: seed_features is not reset in this cell, so re-running the cell
# appends a second copy of every feature.
for label in labels_for_seeds_dict:
    # Row positions (into inv_seed_to_node_lengths) of the seeds with this label.
    indices = labels_for_seeds_dict[label]
    label_inv_seed_to_node_lengths = inv_seed_to_node_lengths[indices]
    # One slot per node of B; nodes not listed in subU_nodes keep the value 0.
    label_max_len_to_seed = np.zeros(len(B.nodes()))
    label_max_len_to_seed[subU_nodes] = np.max(
        label_inv_seed_to_node_lengths, axis=0)
    # Gather per edge: each edge takes the value of its source node.
    label_max_len_to_seed = label_max_len_to_seed[sources]
    
    
    # Same scatter/gather as above, using the mean over the label's seeds.
    label_mean_len_to_seed = np.zeros(len(B.nodes()))
    label_mean_len_to_seed[subU_nodes] = np.mean(
        label_inv_seed_to_node_lengths, axis=0)
    label_mean_len_to_seed = label_mean_len_to_seed[sources]
    
    seed_features.append(label_max_len_to_seed)
    seed_features.append(label_mean_len_to_seed)

In [45]:
label_max_len_to_seed, label_mean_len_to_seed

(array([0.16666667, 0.16666667, 0.16666667, ..., 0.16666667, 0.16666667,
        0.16666667]),
 array([0.07113095, 0.07113095, 0.07113095, ..., 0.06865079, 0.06865079,
        0.06865079]))

In [46]:
label_max_len_to_seed[:100]

array([0.16666667, 0.16666667, 0.16666667, 0.14285714, 0.14285714,
       0.14285714, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
       0.16666667, 0.        , 0.16666667, 0.16666667, 0.16666667,
       0.16666667, 0.16666667, 0.14285714, 0.14285714, 0.14285714,
       0.25      , 0.25      , 0.25      , 0.25      , 0.        ,
       0.14285714, 0.14285714, 0.14285714, 0.14285714, 0.14285714,
       0.2       , 0.2       , 0.2       , 0.2       , 0.        ,
       0.        , 0.        , 0.        , 0.2       , 0.2       ,
       0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
       0.2       , 0.2       , 0.2       , 0.2       , 0.16666667,
       0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
       0.16666667, 0.16666667, 0.16666667, 0.2       , 0.2       ,
       0.2       , 0.2       , 0.2       , 0.125     , 0.25      ,
       0.25      , 0.25      , 0.25      , 0.25      , 0.25      ,
       0.25      , 0.16666667, 0.16666667, 0.16666667, 0.16666

In [34]:
list(max(nx.connected_component_subgraphs(U), key=len).nodes())

NodeView((1, 5, 8, 10, 12, 13, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 38, 39, 40, 42, 43, 44, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 65, 66, 69, 70, 72, 75, 76, 77, 78, 79, 80, 81, 83, 84, 87, 88, 90, 91, 92, 93, 95, 96, 98, 99, 100, 101, 103, 104, 105, 106, 107, 110, 113, 114, 115, 118, 119, 122, 123, 124, 126, 128, 130, 131, 132, 134, 135, 136, 137, 138, 142, 144, 147, 148, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 162, 167, 168, 169, 170, 172, 173, 177, 178, 180, 181, 184, 186, 188, 189, 190, 191, 194, 195, 197, 198, 200, 201, 203, 204, 205, 206, 208, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 224, 226, 227, 228, 229, 230, 231, 232, 234, 236, 237, 240, 241, 242, 243, 244, 246, 247, 249, 250, 252, 253, 254, 255, 258, 259, 260, 263, 265, 266, 267, 268, 269, 272, 273, 274, 280, 285, 286, 287, 289, 292, 293, 294, 298, 300, 302, 303, 304, 307, 308, 311, 312, 313, 314, 316, 317, 318, 319, 321, 322, 325, 330, 331, 332, 