### Load data

In [None]:
import os
import pandas as pd

In [None]:
# Dataset directory, resolved against the current working directory.
DATA_DIR = os.path.join(os.path.realpath(''), 'data', 'heterogeneous')

# Read the four CSV tables in one pass; the unpacking order follows the
# order of the file names in the tuple below.
p2p_df, p2a_df, terms_df, labels_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('p2p.csv', 'p2a.csv', 'terms.csv', 'labels.csv')
)

In [None]:
from graph_ml.utility.graph_representation import normalize_adj_matrix

In [None]:
# Plain NumPy views of the node inputs and the target column.
terms = terms_df.to_numpy()
labels = labels_df['Label'].to_numpy()

# Both relation tables go through the project's normalization helper.
# Only the p2p matrix is normalized with add_self_connection=True
# (presumably adds self-loops before normalizing -- confirm in graph_ml).
norm_p2a = normalize_adj_matrix(p2a_df.to_numpy())
norm_p2p = normalize_adj_matrix(
    p2p_df.to_numpy(),
    add_self_connection=True,
)

### Split data

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Split node positions (not feature rows) so the model can later gather
# the matching rows itself; 20% of the positions are held out for testing
# and random_state=0 keeps the split reproducible.
train_indices, test_indices, train_labels, test_labels = train_test_split(
    np.arange(len(labels)),
    labels,
    test_size=0.2,
    random_state=0,
)

### Heterogeneous GCN Model

In [None]:
from itertools import zip_longest

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import layers as l

from graph_ml.utility.feed_forward_layer import FeedForwardLayer
from graph_ml.gnn.gcn.gcn_head import GCNHead
from graph_ml.gnn.gcn.gcn_layer import GCNLayer
from graph_ml.gnn.gcn.hete_gcn_layer import HeteGCNLayer

In [None]:
class HeteGCNClassifier(Model):
    """Node classifier over a heterogeneous graph of papers and authors.

    Paper inputs pass through a dense projection and authors are given a
    learned embedding table. Both representations then flow through three
    graph stages (papers+authors -> papers, papers -> authors,
    papers+authors -> papers) before a feed-forward head emits class
    probabilities for the requested paper indices.

    Args:
        p_features: Per-paper input features, stored as a constant tensor.
            NOTE(review): presumably shaped (n_papers, n_features) -- confirm.
        norm_p2p_matrix: Matrix handed to the paper-side GCN heads
            (presumably a normalized paper-to-paper adjacency -- confirm).
        norm_p2a_matrix: 2-D matrix whose second dimension is taken as the
            number of authors; its transpose drives the papers -> authors
            stage.
        num_classes: Width of the final dense layer, i.e. the number of
            output classes.
    """

    def __init__(self, p_features, norm_p2p_matrix, norm_p2a_matrix, num_classes):
        super().__init__()

        # The two-way unpacking requires a 2-D matrix; only the author
        # count (second dimension) is used.
        _n_rows, author_count = norm_p2a_matrix.shape
        p2p_adj = tf.constant(norm_p2p_matrix, dtype=tf.float32)
        p2a_adj = tf.constant(norm_p2a_matrix, dtype=tf.float32)

        # No author features are passed in, so every author is represented
        # by a row of a learned 128-wide embedding, looked up by position.
        self.author_index = tf.range(author_count)
        self.author_embedding = l.Embedding(author_count, 128)

        # Paper inputs are projected to the same width (128) as the
        # author embedding.
        self.paper_features = tf.constant(p_features)
        self.p_preprocessing1 = l.Dense(128)

        # Stage 1: papers + authors -> papers; the two heads are combined
        # with 'sum' and dropout=0.2 is passed to the layer.
        paper_head_1 = GCNHead(p2p_adj, hidden_units=[128, 128])
        author_head_1 = GCNHead(p2a_adj, hidden_units=[128, 128])
        self.pa2p_hete_gcn1 = HeteGCNLayer(
            [paper_head_1, author_head_1],
            combination_func_type='sum',
            dropout=0.2,
        )

        # Stage 2: papers -> authors, using the transposed p2a matrix.
        self.p2a_gcn2 = GCNLayer(
            tf.transpose(p2a_adj),
            hidden_units=[128, 96],
        )

        # Stage 3: papers + authors -> papers again, now at width 96.
        paper_head_3 = GCNHead(p2p_adj, hidden_units=[96])
        author_head_3 = GCNHead(p2a_adj, hidden_units=[96])
        self.pa2p_hete_gcn3 = HeteGCNLayer(
            [paper_head_3, author_head_3],
            combination_func_type='sum',
        )

        # Classification head: feed-forward block, class logits, softmax.
        self.postprocessing1 = FeedForwardLayer([48])
        self.logits1 = l.Dense(num_classes)
        self.softmax1 = l.Softmax()

    def call(self, indices):
        """Return softmax class probabilities for the papers at ``indices``.

        The stored paper features and the full author embedding table are
        pushed through every graph stage on each call; ``indices`` only
        selects which rows of the final paper representation reach the
        classification head.
        """
        paper_repr = self.p_preprocessing1(self.paper_features)
        author_repr = self.author_embedding(self.author_index)

        paper_repr = self.pa2p_hete_gcn1((paper_repr, author_repr))
        author_repr = self.p2a_gcn2(paper_repr)
        paper_repr = self.pa2p_hete_gcn3((paper_repr, author_repr))

        # Keep only the rows for the requested papers.
        selected = tf.gather(paper_repr, indices)

        hidden = self.postprocessing1(selected)
        return self.softmax1(self.logits1(hidden))

### Classify nodes

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [None]:
# One output unit per distinct label value.
num_classes = len(set(labels))

model = HeteGCNClassifier(
    p_features=terms,
    norm_p2p_matrix=norm_p2p,
    norm_p2a_matrix=norm_p2a,
    num_classes=num_classes,
)

# The model ends in a softmax, so the loss is used with its default
# probability (non-logit) inputs.
model.compile(
    optimizer=Adam(1e-4),
    loss=SparseCategoricalCrossentropy(),
    metrics=[SparseCategoricalAccuracy(name="accuracy")],
)

# Training is driven by node indices rather than feature rows; a fifth of
# the training indices is set aside for validation.
_ = model.fit(
    x=train_indices,
    y=labels[train_indices],
    batch_size=500,
    epochs=20,
    validation_split=0.2,
    verbose=0,
)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Ground-truth classes for the test indices.
true_labels = labels[test_indices]

# The model returns one probability row per requested index; the predicted
# class is the column holding the largest probability.
predicted_labels = np.argmax(model(test_indices).numpy(), axis=1)

print(classification_report(true_labels, predicted_labels))