# GCN on Node Classification

## Dependencies and imports

In [None]:
!pip install --quiet spektral
!pip install --quiet neural_structured_learning
!pip install --quiet tensorflow_hub
!pip install --quiet tensorflow-text


In [None]:
"""
This example implements the experiments on citation networks from the paper:
Semi-Supervised Classification with Graph Convolutional Networks (https://arxiv.org/abs/1609.02907)
Thomas N. Kipf, Max Welling
"""
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop


from spektral.data.loaders import SingleLoader
from spektral.datasets.citation import Citation
from spektral.layers import GCNConv
from spektral.models.gcn import GCN
from spektral.transforms import LayerPreprocess

from spektral.data import Graph
from spektral.data import Dataset
from spektral.transforms import GCNFilter
import pandas as pd
from spektral.utils.sparse import sp_matrix_to_sp_tensor


In [None]:
import neural_structured_learning as nsl

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [None]:
# Clear Keras' global state before any model is built in this session.
tf.keras.backend.clear_session()

# Print the runtime environment (library versions, eager mode, GPU presence)
# so that results can be tied to the setup that produced them.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print(
    "GPU is",
    "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.9.1
Eager mode:  True
Hub version:  0.12.0
GPU is NOT AVAILABLE


## Load data

In [None]:
# Load the IMDB review dataset bundled with Keras as train/test splits of
# (integer-encoded reviews, labels). num_words=10000 caps the vocabulary size.
# NOTE(review): the commented-out Citation line below is a leftover from a
# different example; `data` is not defined in any visible cell.
# dataset = Citation(data, normalize_x=True, transforms=[LayerPreprocess(GCNConv)])
imdb = tf.keras.datasets.imdb
(pp_train_data, pp_train_labels), (pp_test_data, pp_test_labels) = (
    imdb.load_data(num_words=10000))



In [None]:
# Debug-only truncation: keep just the first `limit` samples of every split so
# the rest of the notebook runs quickly. Skip this cell for final experiments.

limit = 1000

pp_train_data, pp_train_labels = pp_train_data[:limit], pp_train_labels[:limit]
pp_test_data, pp_test_labels = pp_test_data[:limit], pp_test_labels[:limit]

In [None]:
# Record the number of training samples and report it next to the label count;
# the two numbers should match.
training_samples_count = len(pp_train_data)
print('Training entries: {}, labels: {}'.format(
    training_samples_count, len(pp_train_labels)))

Training entries: 1000, labels: 1000


In [None]:
# Show the first training review in its raw form: a list of integer word indices.
print(pp_train_data[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [None]:
# Reviews are variable-length: compare the lengths of the first two.
len(pp_train_data[0]), len(pp_train_data[1])

(218, 189)

In [None]:
def sampleFunction(inputFeature):
    """
    Placeholder that illustrates the docstring layout used in this notebook.

    Args:
        inputFeature - (np.ndarray) Accepted but never read
    Returns:
        result - (int) Always the constant 55
    """
    return 55

def buildReverseWordIndex(dataset):
    """
    Build the lookup table used to turn integer-encoded reviews back into words.

    Every index returned by dataset.get_word_index() is shifted up by 3, and
    the ids 0-3 are assigned to the special tokens <PAD>, <START>, <UNK> and
    <UNUSED>.

    Args:
        dataset - (keras.dataset) Object exposing get_word_index()
    Returns:
        reverseWordIndex - (dict) A dictionary mapping an integer index to its word
    """
    reservedTokens = ('<PAD>', '<START>', '<UNK>', '<UNUSED>')
    offset = 3

    # word -> shifted index, leaving the low ids free for the special tokens
    shifted = {}
    for word, index in dataset.get_word_index().items():
        shifted[word] = index + offset
    for tokenId, token in enumerate(reservedTokens):
        shifted[token] = tokenId

    # Invert to index -> word
    return {index: word for word, index in shifted.items()}

def decodeReview(text, reverseWordIndex):
    """
    Turn an integer-encoded review back into a space-separated string.

    Args:
        text - (np.ndarray) Sequence of integer word indices
        reverseWordIndex - (dict) Mapping from integer index to word
    Returns:
        decodedReview - (string) The decoded review; indices missing from
            reverseWordIndex are rendered as '?'
    """
    words = (reverseWordIndex.get(index, '?') for index in text)
    return ' '.join(words)


In [None]:
# Build the index -> word table once, then decode the first training review as
# a sanity check. `reverseWordIndex` is reused by the embedding cells further down.
reverseWordIndex = buildReverseWordIndex(imdb)
decodeReview(pp_train_data[0], reverseWordIndex)

"<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for wh

## Generate BERT Embedding

In [None]:
# TF Hub handle of the small BERT encoder.
# NOTE(review): the encoder cell below hard-codes this same URL instead of
# using this variable, and the Swivel section later reassigns the name.
pretrained_embedding = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'

In [None]:
# Symbolic Keras input holding one scalar string per example (shape=()).
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
# TF Hub layer loaded from the BERT uncased preprocessing handle; it is applied
# to raw strings before they are passed to the encoder.
preprocessor = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

In [None]:
# Pass the symbolic text input through the preprocessor to get encoder inputs.
encoder_inputs = preprocessor(text_input)

# Small BERT encoder loaded from TF Hub, with its weights marked trainable.
encoder = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2',
    trainable=True)

outputs = encoder(encoder_inputs)

pooled_output = outputs['pooled_output'] # [batch_size, 128].

# [batch_size, seq_length, 128].
sequence_output = outputs['sequence_output']

In [None]:
def int64Feature(value):
    """
    Wrap an integer array in a tf.train.Feature.

    Args:
        value - (np.ndarray) array of ints; converted with .tolist()
    Returns:
        feature - (tf.train.Feature) feature whose int64_list holds the values
    """
    int64List = tf.train.Int64List(value=value.tolist())
    return tf.train.Feature(int64_list=int64List)


def bytesFeature(value):
    """
    Wrap a single string in a tf.train.Feature.

    Args:
        value - (string) text to store; encoded as UTF-8 bytes
    Returns:
        feature - (tf.train.Feature) feature whose bytes_list holds one entry
    """
    encoded = value.encode('utf-8')
    bytesList = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=bytesList)


def floatFeature(value):
    """
    Wrap a float array in a tf.train.Feature.

    Args:
        value - (np.ndarray) array of floats; converted with .tolist()
    Returns:
        feature - (tf.train.Feature) feature whose float_list holds the values
    """
    floatList = tf.train.FloatList(value=value.tolist())
    return tf.train.Feature(float_list=floatList)


In [None]:
def createBertEmbeddingExample(wordVector, recordID, reverseWordIndex, encoder, preprocessor):
    """
    Create a tf.Example holding one review's BERT embedding and its ID.

    Args:
        wordVector - (np.ndarray) integer-encoded review to decode and embed
        recordID - (int) ID of the sample; stored as its string form
        reverseWordIndex - (dict) The reverse word index to use for decoding
        encoder - (callable) applied to the preprocessor output; its result
            must expose a 'pooled_output' entry
        preprocessor - (callable) applied to a 1-D batch of strings
    Returns:
        example - (tf.Example) features 'id' (bytes) and 'embedding' (floats)
    """
    reviewText = decodeReview(wordVector, reverseWordIndex)

    # Wrap the single review into a 1-D batch before preprocessing.
    textBatch = tf.reshape(reviewText, shape=[-1, ])
    encoderOutputs = encoder(preprocessor(textBatch))

    # Collapse the pooled output into a flat 1-D vector.
    flatEmbedding = tf.reshape(encoderOutputs['pooled_output'], shape=[-1])

    featureMap = tf.train.Features(feature={
        'id': bytesFeature(str(recordID)),
        'embedding': floatFeature(flatEmbedding.numpy()),
    })
    return tf.train.Example(features=featureMap)


def createBertEmbedding(wordVectors, outputPath, startingRecordId, reverseWordIndex, encoder, preprocessor):
    """
    Embed every review with BERT and write the results to a TFRecord file.

    Args:
        wordVectors - (np.ndarray) integer-encoded reviews to embed
        outputPath - (string) path of the TFRecord file to write
        startingRecordId - (int) ID assigned to the first sample
        reverseWordIndex - (dict) The reverse word index to use for decoding
        encoder - (callable) encoder forwarded to createBertEmbeddingExample
        preprocessor - (callable) preprocessor forwarded to createBertEmbeddingExample
    Returns:
        recordID - (int) the next unused ID, i.e. startingRecordId plus the
            number of records written
    """
    nextRecordID = int(startingRecordId)
    with tf.io.TFRecordWriter(outputPath) as writer:
        for wordVector in wordVectors:
            serialized = createBertEmbeddingExample(
                wordVector, nextRecordID, reverseWordIndex, encoder, preprocessor
            ).SerializeToString()
            nextRecordID += 1
            writer.write(serialized)
    return nextRecordID


In [None]:
!mkdir /tmp/imdb

mkdir: cannot create directory ‘/tmp/imdb’: File exists


In [None]:
# Generate BERT embeddings for every training review and persist them as
# TFRecords, with record IDs starting at 0. The value displayed by the cell is
# the ID returned by createBertEmbedding.

bertEmbeddingsPath = '/tmp/imdb/bertEmeddings.tfr'
createBertEmbedding(pp_train_data, bertEmbeddingsPath, 0, reverseWordIndex, encoder, preprocessor)


1000

In [None]:
# NOTE(review): `wc -l` counts newline bytes, and this file is written by
# tf.io.TFRecordWriter, so the number is presumably not a record count — confirm.
!wc -l /tmp/imdb/bertEmeddings.tfr

9329 /tmp/imdb/bertEmeddings.tfr


In [None]:
def createBertEmbeddingExample_tf(wordVector, reverseWordIndex, encoder, preprocessor):
    """
    Compute the BERT embedding of one review and return it as a tensor.

    In-memory counterpart of createBertEmbeddingExample: same decoding and
    encoding steps, but nothing is wrapped into a tf.Example.

    Args:
        wordVector - (np.ndarray) integer-encoded review to decode and embed
        reverseWordIndex - (dict) The reverse word index to use for decoding
        encoder - (callable) applied to the preprocessor output; its result
            must expose a 'pooled_output' entry
        preprocessor - (callable) applied to a 1-D batch of strings
    Returns:
        sentenceEmbedding - (tf.Tensor) the pooled output reshaped to 1-D
    """
    reviewText = decodeReview(wordVector, reverseWordIndex)

    # Wrap the single review into a 1-D batch before preprocessing.
    textBatch = tf.reshape(reviewText, shape=[-1, ])
    pooled = encoder(preprocessor(textBatch))['pooled_output']

    # Collapse the pooled output into a flat 1-D vector.
    return tf.reshape(pooled, shape=[-1])

def createBertEmbedding_tf(wordVectors, reverseWordIndex, encoder, preprocessor):
    """
    Embed every review with BERT and collect the results in one array.

    Args:
        wordVectors - (np.ndarray) integer-encoded reviews to embed
        reverseWordIndex - (dict) The reverse word index to use for decoding
        encoder - (callable) encoder forwarded to createBertEmbeddingExample_tf
        preprocessor - (callable) preprocessor forwarded to createBertEmbeddingExample_tf
    Returns:
        sentenceEmbeddings_np - (np.ndarray) np.array built from the
            per-review embeddings, in input order
    """
    perReviewEmbeddings = [
        createBertEmbeddingExample_tf(wordVector, reverseWordIndex, encoder, preprocessor)
        for wordVector in wordVectors
    ]
    return np.array(perReviewEmbeddings)


In [None]:
# Compute the BERT embeddings again, this time keeping them in memory as a
# NumPy array; they are used later as the node features of the graph.
bertEmbeddings_np = createBertEmbedding_tf(pp_train_data, reverseWordIndex, encoder, preprocessor)
bertEmbeddings_np

array([[-0.99705344,  0.13148627, -0.9839852 , ...,  0.46490347,
         0.9740655 ,  0.9860606 ],
       [-0.99880314, -0.01381373, -0.94920063, ...,  0.7923372 ,
         0.963446  ,  0.9852994 ],
       [-0.9977645 ,  0.08967871, -0.99587804, ...,  0.7926493 ,
         0.9919198 ,  0.98849094],
       ...,
       [-0.99963784,  0.01017996, -0.9731926 , ...,  0.8962663 ,
         0.9838081 ,  0.90645903],
       [-0.99953955, -0.00890382, -0.7950634 , ...,  0.6752377 ,
         0.9756713 ,  0.94408995],
       [-0.99809605,  0.03867956, -0.98139894, ..., -0.70533836,
         0.990776  ,  0.9948569 ]], dtype=float32)

In [None]:
# Shape of the embedding matrix: (number of reviews, embedding size).
np.shape(bertEmbeddings_np)

(1000, 128)

## Construct Graph Using Swivel Embedding


In [None]:
# Make sure the scratch directory exists (-p: no error if it is already there).
!mkdir -p /tmp/imdb

In [None]:
# TF Hub handle of the Swivel text embedding.
# NOTE(review): this reassigns `pretrained_embedding`, which held the BERT
# handle earlier in the notebook.
pretrained_embedding = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'

# Keras layer wrapping the Swivel model; takes scalar strings (input_shape=[]).
hub_layer = hub.KerasLayer(
    pretrained_embedding, input_shape=[], dtype=tf.string, trainable=True)

In [None]:
def create_embedding_example(word_vector, record_id):
  """Create a tf.Example holding one review's Swivel embedding and its ID.

  Relies on the notebook-level `reverseWordIndex` (for decoding) and
  `hub_layer` (for embedding).

  Args:
    word_vector: integer-encoded review to decode and embed.
    record_id: ID of the sample; stored as its string form.

  Returns:
    tf.train.Example with features 'id' (bytes) and 'embedding' (floats).
  """
  review_text = decodeReview(word_vector, reverseWordIndex)

  # Wrap the single review into a 1-D batch before embedding it.
  text_batch = tf.reshape(review_text, shape=[-1,])

  # Collapse the layer output into a flat 1-D vector.
  flat_embedding = tf.reshape(hub_layer(text_batch), shape=[-1])

  feature_map = tf.train.Features(feature={
      'id': bytesFeature(str(record_id)),
      'embedding': floatFeature(flat_embedding.numpy()),
  })
  return tf.train.Example(features=feature_map)


def create_embeddings(word_vectors, output_path, starting_record_id):
  """Embed every review with Swivel and write the results as TFRecords.

  Args:
    word_vectors: integer-encoded reviews to embed.
    output_path: path of the TFRecord file to write.
    starting_record_id: ID assigned to the first sample.

  Returns:
    The next unused record ID (starting ID plus the number of records written).
  """
  next_record_id = int(starting_record_id)
  with tf.io.TFRecordWriter(output_path) as writer:
    for word_vector in word_vectors:
      serialized = create_embedding_example(
          word_vector, next_record_id).SerializeToString()
      next_record_id += 1
      writer.write(serialized)
  return next_record_id


# Persist tf.Example features containing the Swivel embeddings of the training
# data in TFRecord format, with record IDs starting at 0. The displayed value
# is the ID returned by create_embeddings.
create_embeddings(pp_train_data, '/tmp/imdb/embeddings.tfr', 0)

1000

In [None]:
# Build a similarity graph from the Swivel embeddings in
# /tmp/imdb/embeddings.tfr and write it to /tmp/imdb/graph_99.tsv.
# NOTE(review): similarity_threshold=0.99 presumably keeps only pairs at least
# that similar; lsh_splits / lsh_rounds tune the builder's search — confirm
# against the NSL documentation. random_seed fixes the builder's randomness.
graph_builder_config = nsl.configs.GraphBuilderConfig(
    similarity_threshold=0.99, lsh_splits=32, lsh_rounds=15, random_seed=42)
nsl.tools.build_graph_from_config(['/tmp/imdb/embeddings.tfr'],
                                  '/tmp/imdb/graph_99.tsv',
                                  graph_builder_config)

In [None]:
# Line count of the graph file (presumably one line per written edge — confirm).
!wc -l /tmp/imdb/graph_99.tsv

1486 /tmp/imdb/graph_99.tsv


## Construct Graph Using BERT Embedding

In [None]:
# Build a similarity graph from the BERT embeddings in
# /tmp/imdb/bertEmeddings.tfr, using the same builder settings as the Swivel
# graph.
# NOTE(review): the output path is the same /tmp/imdb/graph_99.tsv the Swivel
# graph was written to, so this cell overwrites that file; every later cell
# that reads graph_99.tsv sees the BERT graph.
graph_builder_config = nsl.configs.GraphBuilderConfig(
    similarity_threshold=0.99, lsh_splits=32, lsh_rounds=15, random_seed=42)
nsl.tools.build_graph_from_config(['/tmp/imdb/bertEmeddings.tfr'],
                                  '/tmp/imdb/graph_99.tsv',
                                  graph_builder_config)

In [None]:
# Line count of the graph file after it was rebuilt from the BERT embeddings.
!wc -l /tmp/imdb/graph_99.tsv

60 /tmp/imdb/graph_99.tsv


## Convert NSL Graph to Spektral Graph



In [None]:
# Load the edge list written by the NSL graph builder and turn it into a dense
# adjacency matrix with one row/column per training review.
#
# The TSV has no header row: every line is "source_id <TAB> target_id <TAB>
# weight". header=None is required; without it pandas consumes the first edge
# as column names and that edge is silently missing from the matrix.
df = pd.read_csv('/tmp/imdb/graph_99.tsv', sep="\t", header=None,
                 names=['source', 'target', 'weight'])
imdb_graph_a_adjsency_matrix = df.values
size = len(pp_train_data)
imdb_graph_a = np.zeros((size, size))

# Node ids are the record IDs written alongside the embeddings (0 .. size-1),
# so they can be used directly as row/column indices.
edge_sources = df['source'].to_numpy(dtype=int)
edge_targets = df['target'].to_numpy(dtype=int)
imdb_graph_a[edge_sources, edge_targets] = df['weight'].to_numpy(dtype=float)




In [None]:
# NOTE(review): max_seq_length_slice is only referenced by the disabled loop
# below; no other visible cell uses it.
max_seq_length_slice = 256
# Disabled: pad with zeros / truncate every review to max_seq_length_slice tokens.
# pp_train_data_sliced = np.zeros((len(pp_train_data), max_seq_length_slice))
# i = 0
# while i < len(pp_train_data):
#   if len(pp_train_data[i]) > max_seq_length_slice:
#     pp_train_data_sliced[i] = pp_train_data[i][:max_seq_length_slice]
#   else:
#     pp_train_data_sliced[i] = np.append(np.array(pp_train_data[i]), np.zeros(max_seq_length_slice - len(pp_train_data[i])))
#   i = i + 1




In [None]:
# Sanity check of the node-feature matrix: row count and row length, then the
# full shape.
print(len(bertEmbeddings_np),len(bertEmbeddings_np[0]))
print(bertEmbeddings_np.shape)

1000 128
(1000, 128)


In [None]:
# One-hot encode the labels into an (n_samples, 2) float array: column 1 is
# set for label 1, column 0 for every other label.
# Done with vectorised indexing instead of a loop over `_`: in IPython `_`
# holds the last cell output, so using it as a loop variable shadows it.
train_labels = np.asarray(pp_train_labels)
pp_train_labels_binary = np.zeros((len(train_labels), 2))
positive_column = (train_labels == 1).astype(int)
pp_train_labels_binary[np.arange(len(train_labels)), positive_column] = 1


In [None]:
# Bundle everything into one Spektral graph: a = dense adjacency matrix,
# x = BERT embeddings (one row per review), y = one-hot labels.
imdb_graph=Graph(a=imdb_graph_a, x=bertEmbeddings_np, y=pp_train_labels_binary)


In [None]:
class MyDataset(Dataset):
    """
    Single-graph Spektral dataset wrapping the notebook-level `imdb_graph`.

    `read` returns a one-element list containing `imdb_graph` itself (not a
    copy); `download` writes that graph's arrays to /tmp/imdb/imdb_graph.npz.
    """
    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to spektral's Dataset."""
        super().__init__(**kwargs)

    def download(self):
        """
        Save the x, a and y arrays of `imdb_graph` to /tmp/imdb/imdb_graph.npz.

        NOTE(review): the file is written to a fixed path and is not read back
        by `read`; presumably it is kept only as an on-disk snapshot.
        """
        path = '/tmp/imdb'
        # np.savez does not create missing directories; exist_ok keeps this
        # safe when the directory is already there.
        os.makedirs(path, exist_ok=True)
        filename = os.path.join(path, 'imdb_graph')
        np.savez(filename, x=imdb_graph.x, a=imdb_graph.a, y=imdb_graph.y)

    def read(self):
        """
        Return the dataset's graphs.

        Returns:
            output - (list) one-element list holding `imdb_graph`
        """
        # We must return a list of Graph objects
        return [imdb_graph]

In [None]:
# Instantiate the single-graph dataset.
dataset = MyDataset()
# NOTE(review): bare expression in the middle of the cell; its value is discarded.
dataset[0]
# Apply GCNFilter to the graph(s) in the dataset.
# NOTE(review): MyDataset.read returns the shared `imdb_graph` object, so if
# the transform modifies the graph in place, re-running this cell would apply
# it a second time — confirm.
dataset.apply(GCNFilter())

In [None]:
# Shape of the node-feature matrix of the dataset's only graph.
dataset[0].x.shape

(1000, 128)

## Train the model

In [None]:
# Training hyper-parameters.
# NOTE(review): SGD is imported here but not used in any visible cell, and
# `patience` is only referenced by the commented-out EarlyStopping callback
# in the training cell.
from tensorflow.keras.optimizers import SGD
learning_rate = 5e-3
seed = 0
epochs = 2000
patience = 10

tf.random.set_seed(seed=seed)  # make weight initialization reproducible

In [None]:
# Spektral's GCN model, with the number of labels taken from the graph.
model = GCN(n_labels=imdb_graph.n_labels)

In [None]:
# Compile with Adam at the configured learning rate and track accuracy as a
# weighted metric.
# NOTE(review): the labels are one-hot with two columns and the loss is
# BinaryCrossentropy; CategoricalCrossentropy is imported at the top of the
# notebook but unused — confirm this is the intended loss.
model.compile(
    optimizer=Adam(learning_rate),
    loss=tf.keras.losses.BinaryCrossentropy(),
    weighted_metrics=["acc"],
)

In [None]:
# Train model
loader_tr = SingleLoader(dataset)
model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    # callbacks=[EarlyStopping(patience=patience, restore_best_weights=True)],
)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<keras.callbacks.History at 0x7fa8b4303690>

## Evaluate the model

In [None]:
# Evaluate model
# `weights_te` is not defined in any visible cell and no train/test node split
# is created anywhere, so passing sample_weights=weights_te raised NameError.
# Evaluate over every node of the graph instead.
# NOTE: these are the same nodes the model was fitted on, so the figures
# printed below are training-set metrics rather than held-out performance,
# despite the "Test" wording.
print("Evaluating model.")
loader_te = SingleLoader(dataset)
eval_results = model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch)
print("Done.\n" "Test loss: {}\n" "Test accuracy: {}".format(*eval_results))

Evaluating model.


NameError: ignored