- 7 classes of documents
- 20 docs/class labeled
- features/document: 1433 words, an undirected graph where each doc is a node and citations = edges
- classify documents

Resources
---------
* Dataset: https://graphsandnetworks.com/the-cora-dataset/
* Intro videos, only 20 min total: https://www.youtube.com/watch?v=XRHhtLgpXqg and https://www.youtube.com/watch?v=Zjx25h8DnIo
* very good, concise blog post by author of paper: https://tkipf.github.io/graph-convolutional-networks/
* paper github repo https://github.com/tkipf/gcn/blob/master/gcn/train.py

In [None]:
# Load data, only needed once
!mkdir data
!curl https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz -o data/cora.tgz
!tar -xzf ./data/cora.tgz -C ./data/
!rm ./data/cora.tgz

In [63]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import networkx as nx
import tensorflow as tf
import stellargraph as sg
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
from stellargraph.layer.gcn import GraphConvolution, GatherIndices

## Question 23
classify using graph convolutional nets

### Failures

In [None]:
# Unpack data
edgelist = pd.read_csv('./data/cora/cora.cites', sep='\t', header=None, names=["target", "source"])
edgelist["label"] = "cites"
g = nx.from_pandas_edgelist(edgelist, edge_attr="label")
nx.set_node_attributes(g, "paper", "label")

n_features = 1433
feature_names = [f"w_{i}" for i in range(n_features)]
column_names =  feature_names + ["subject"]
node_data = pd.read_csv('./data/cora/cora.content', sep='\t', header=None, names=column_names)

# Get 20 instances per class for training
train_nodes = []
classes = node_data.subject.unique()
for subject in classes:
    train_nodes += node_data[node_data['subject'] == subject].sample(20).index.to_list()

In [None]:
import stellargraph as sg
import tensorflow as tf
from sklearn import model_selection


nodes, edges, targets = node_data, edgelist, node_data[["subject"]]

# Use scikit-learn to compute training and test sets
train_targets, test_targets = model_selection.train_test_split(targets, train_size=0.5)

# convert the raw data into StellarGraph's graph format for faster operations
# graph = sg.StellarGraph(nodes, edges)
graph = sg.StellarGraph.from_networkx(g, node_features=node_data[feature_names])

generator = sg.mapper.FullBatchNodeGenerator(graph, method="gcn")

# two layers of GCN, each with hidden dimension 16
gcn = sg.layer.GCN(layer_sizes=[16, 16], generator=generator)
x_inp, x_out = gcn.in_out_tensors() # create the input and output TensorFlow tensors

# use TensorFlow Keras to add a layer to compute the (one-hot) predictions
predictions = tf.keras.layers.Dense(units=7, activation="softmax")(x_out)

# use the input and output tensors to create a TensorFlow Keras model
model = tf.keras.Model(inputs=x_inp, outputs=predictions)

# prepare the model for training with the Adam optimiser and an appropriate loss function
model.compile("adam", loss="categorical_crossentropy", metrics=["accuracy"])

# train the model on the train set
model.fit(generator.flow(train_targets.index, train_targets), epochs=5)

# check model generalisation on the test set
(loss, accuracy) = model.evaluate(generator.flow(test_targets.index, test_targets))
print(f"Test set: loss = {loss}, accuracy = {accuracy}")

In [None]:
# Set Neural Net Hyperparameters
learning_rate = 1e-2
epochs = 2 # 200
hidden_layer_size = 16 # # of units in hidden layer 1
dropout = 0.5
weight_decay = 5e-4
early_stopping = 10
max_degree = 3


# Create StellarGraph object
G = sg.StellarGraph.from_networkx(g, node_features=node_data[feature_names])

# Create train and test data
train_data = node_data.loc[train_nodes]
test_data = node_data.drop(train_nodes)

# Create target vectors
train_targets = train_data['subject']
test_targets = test_data['subject']


# Create generator for training data
generator = sg.mapper.FullBatchNodeGenerator(G, method="gcn")
train_gen = generator.flow(train_data.index, train_targets)

# Create GCN model
gcn = sg.layer.GCN(
    layer_sizes=[hidden_layer_size, len(classes)],
    activations=["relu", "softmax"],
    generator=generator,
    dropout=dropout,
)

# Create input and output layers
x_inp, x_out = gcn.in_out_tensors()

# Create model
model = tf.keras.Model(inputs=x_inp, outputs=x_out)

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=["acc"],
)

In [None]:
# Train model
history = model.fit(
    train_gen,
    epochs=epochs,
    verbose=2,
    validation_data=None,
    shuffle=False,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=early_stopping, restore_best_weights=True)
    ],
)

# # Create test generator
# test_gen = generator.flow(test_data.index, test_targets)

# # Evaluate model
# test_metrics = model.evaluate(test_gen)

# print("\nTest Set Metrics:")
# for name, val in zip(model.metrics_names, test_metrics):
#     print("\t{}: {:0.4f}".format(name, val))


In [None]:
# # Create node embeddings
# embedding_model = tf.keras.Model(inputs=x_inp, outputs=x_out)
# emb = embedding_model.predict(generator.flow(G.nodes()))

# # Create node embeddings
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# import numpy as np

# # Create PCA model

https://stellargraph.readthedocs.io/en/stable/demos/node-classification/gcn-node-classification.html

In [None]:
import pandas as pd
import os

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
learning_rate = 1e-2
epochs = 2 # 200
dropout = 0.5


# Load Dataset
dataset = sg.datasets.Cora()
G, node_subjects = dataset.load()

train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, stratify=node_subjects
)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, stratify=test_subjects
)

# Convert data to one-hot encoding
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

# Create generator for training and validation data
generator = FullBatchNodeGenerator(G, method="gcn")
train_gen = generator.flow(train_subjects.index, train_targets)
validation_gen = generator.flow(val_subjects.index, val_targets)

# Create GCN model
gcn = GCN(
    layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=dropout
)

# Create input, output, and prediction layers
x_inp, x_out = gcn.in_out_tensors()
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

# Create model
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(learning_rate=learning_rate),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

# Track performance
es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

# Train model
history = model.fit(
    train_gen,
    epochs=epochs,
    validation_data=validation_gen,
    verbose=2,
    shuffle=False,
    callbacks=[es_callback],
    batch_size=1,
)

### Current Attempt

TODO: I need to recreate all the indices

In [105]:
# Unpack Data
edgelist = pd.read_csv('./data/cora/cora.cites', sep='\t', header=None, names=["target", "source"])
column_names = [f"w_{i}" for i in range(1433)] + ["subject"]
node_data = pd.read_csv('./data/cora/cora.content', sep='\t', header=None, names=column_names)
node_data['subject'] = pd.Categorical(node_data.subject).codes
# node_data['subject'] = node_data['subject'].astype('category')

# Nodes have weird values like 11573 when there are only 2708 nodes. We map them to [0,2708]
node_mapping = dict(zip(node_data.index, np.arange(len(node_data.index))))
edgelist['target'] = edgelist['target'].map(node_mapping)
edgelist['source'] = edgelist['source'].map(node_mapping)
node_data.index = node_data.index.map(node_mapping)

# Get 20 instances per class for training
train_nodes = []
for subject in node_data.subject.unique():
    train_nodes += node_data[node_data['subject'] == subject].sample(20).index.to_list()
test_nodes = list(set(node_data.index) - set(train_nodes))

# Build Graph
g = sg.StellarGraph(node_data, edgelist)

# Data Preprocessing
target_encoding = LabelBinarizer()
y_train = target_encoding.fit_transform(node_data.loc[train_nodes]['subject'])[None]
y_test = target_encoding.transform(node_data.loc[test_nodes]['subject'])[None]

In [106]:
# Calculate Normalized Adjacency Matrix from GCN Paper
A      = g.to_adjacency_matrix(weighted=True) # adjacency matrix
A_t    = A + np.eye(A.shape[0]) - A.diagonal() # add self-connections to each node
D_t    = np.diag(np.power(np.array(A_t.sum(axis=1)), -0.5)[:,0]) # symmetric normalization matrix
A_norm = D_t @ A @ D_t # normalized adjacency matrix

In [107]:
features_input = node_data.to_numpy()[None]
A_input = A_norm[None]
train_indices = np.array(train_nodes)[None]

In [108]:
# Initialise GCN parameters
kernel_initializer="glorot_uniform"
bias = True
learning_rate=1e-2
bias_initializer="zeros"
n_layers = 2
layer_sizes = [32, 32]
dropout = 0.5
n_features = features_input.shape[2]
n_nodes = features_input.shape[1] # number of classes
n_classes = 7
early_stopping = 10
batch_size = 32
epochs = 1 # 200


# Create input layer
x_features = Input(batch_shape=(1, n_nodes, n_features))
x_indices = Input(batch_shape=(1, None), dtype="int32")
x_adjacency = Input(batch_shape=(1, n_nodes, n_nodes))
x_input  = [x_features, x_indices, x_adjacency]

# Build the model
x = Dropout(dropout)(x_features)
x = GraphConvolution(
    32,
    activation='relu',
    use_bias=True,
    kernel_initializer=kernel_initializer,
    bias_initializer=bias_initializer
)([x, x_adjacency])
x = Dropout(dropout)(x)
x = GraphConvolution(
    32,
    activation='relu',
    kernel_initializer=kernel_initializer,
    bias_initializer=bias_initializer
)([x, x_adjacency])
x = GatherIndices(batch_dims=1)([x, x_indices])
output = Dense(n_classes, activation='sigmoid')(x)

model = Model(inputs=[x_features, x_indices, x_adjacency], outputs=output)
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(1, 2708, 1434)]    0           []                               
                                                                                                  
 dropout_12 (Dropout)           (1, 2708, 1434)      0           ['input_19[0][0]']               
                                                                                                  
 input_21 (InputLayer)          [(1, 2708, 2708)]    0           []                               
                                                                                                  
 graph_convolution_12 (GraphCon  (1, 2708, 32)       45920       ['dropout_12[0][0]',             
 volution)                                                        'input_21[0][0]']         

In [110]:
epochs = 20

# Compile the model
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
    # loss=tf.losses.binary_crossentropy,
    loss=tf.losses.categorical_crossentropy,
    metrics=["acc"],
)

# Early stopping callback
es_callback = EarlyStopping(monitor="val_acc", patience=early_stopping, restore_best_weights=True)

# Train model
history = model.fit(
    x = [features_input, train_indices, A_input],
    y = y_train,
    epochs=epochs,
    verbose=2,
    # validation_data=([x_features, val_subjects.index, x_adjacency], val_targets),
    shuffle=False,
    callbacks=[es_callback],
    # batch_size=1,
)

Epoch 1/20
1/1 - 2s - loss: 1.9146 - acc: 0.3786 - 2s/epoch - 2s/step
Epoch 2/20
1/1 - 0s - loss: 1.8712 - acc: 0.3929 - 151ms/epoch - 151ms/step
Epoch 3/20
1/1 - 0s - loss: 1.7851 - acc: 0.5643 - 142ms/epoch - 142ms/step
Epoch 4/20
1/1 - 0s - loss: 1.7138 - acc: 0.5929 - 168ms/epoch - 168ms/step
Epoch 5/20
1/1 - 0s - loss: 1.5897 - acc: 0.6286 - 171ms/epoch - 171ms/step
Epoch 6/20
1/1 - 0s - loss: 1.4738 - acc: 0.6214 - 139ms/epoch - 139ms/step
Epoch 7/20
1/1 - 0s - loss: 1.3421 - acc: 0.6643 - 143ms/epoch - 143ms/step
Epoch 8/20
1/1 - 0s - loss: 1.2002 - acc: 0.7643 - 192ms/epoch - 192ms/step
Epoch 9/20
1/1 - 0s - loss: 1.0688 - acc: 0.7357 - 184ms/epoch - 184ms/step
Epoch 10/20
1/1 - 0s - loss: 0.9374 - acc: 0.8429 - 164ms/epoch - 164ms/step
Epoch 11/20
1/1 - 0s - loss: 0.8330 - acc: 0.8143 - 143ms/epoch - 143ms/step
Epoch 12/20
1/1 - 0s - loss: 0.7017 - acc: 0.8571 - 153ms/epoch - 153ms/step
Epoch 13/20
1/1 - 0s - loss: 0.6206 - acc: 0.8429 - 136ms/epoch - 136ms/step
Epoch 14/20
1/

## Question 24

## Question 25