### Load packages

In [1]:
import os
import torch
# Export the installed torch version as an environment variable so that the
# ${TORCH} placeholder in the wheel-index URLs below resolves to it.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are looked up on the PyG wheel index page that
# corresponds to this torch version (-f adds that page as a wheel source).
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself comes from the GitHub repository without a version
# pin, so the installed revision can differ between runs.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.0+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.transforms import RandomNodeSplit

In [3]:
import pandas as pd
import numpy as np
import json
import glob
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [4]:
import sys
# True when the google.colab package is already loaded in this interpreter;
# used as the signal that the notebook is running on Colab.
is_colab = 'google.colab' in sys.modules

if is_colab:
    # Colab-only step: authenticate the current user.
    from google.colab import auth
    auth.authenticate_user()

In [5]:
# Mount Google Drive only when running on Colab (see `is_colab` above); the
# google.colab package cannot be imported anywhere else.
if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    drive_path = '/content/drive/Shareddrives/CS523/lastFM-data/'
else:
    # NOTE(review): local fallback location is an assumption -- point this at
    # the directory holding the LastFM files. The trailing slash is required
    # because file names are appended to this string directly.
    drive_path = 'lastFM-data/'

Mounted at /content/drive


### Load dataset: LastFM

In [6]:
# Node features: the JSON frame is transposed before conversion to an array.
df_feat = pd.read_json(f'{drive_path}processed_feature.json')
arr_feat = df_feat.T.to_numpy()

# Edge list: transposed so that the array has one column per edge.
df_edge = pd.read_csv(f'{drive_path}lastfm_asia_edges.csv')
arr_edge = df_edge.T.to_numpy()

# Targets: the first CSV column is used as the index.
df_tar = pd.read_csv(f'{drive_path}lastfm_asia_target.csv', index_col=0)
arr_tar = df_tar.to_numpy()

# Sizes derived from the loaded arrays; they are used when building the model.
num_features = arr_feat.shape[1]
classes = df_tar['target'].sort_values().unique()
num_classes = len(classes)

# Short textual summary of the dataset.
print(
    'Dataset:',
    '====================',
    f'Number of nodes: {arr_feat.shape[0]}',
    f'Number of features: {num_features}',
    f'Number of edges: {arr_edge.shape[1]}',
    f'Number of distinct regions: {num_classes}',
    f'All region classes: {classes}',
    sep='\n',
)

Dataset:
Number of nodes: 7624
Number of features: 7842
Number of edges: 27806
Number of distinct regions: 18
All region classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


In [7]:
# Bundle the raw arrays under the keys that GraphDataset.process() reads.
graph_metadata = dict(data=arr_feat, edge_index=arr_edge, label=arr_tar)

### Create GraphDataset

In [8]:
class GraphDataset(Dataset):
    """PyG dataset wrapping a single graph supplied as in-memory arrays.

    The graph is assembled from ``metadata`` in :meth:`process` and serialized
    to ``<out_dir>/data.pt``; :meth:`get` reads it back from that file.

    Args:
        root: Root directory forwarded to the PyG ``Dataset`` base class.
        out_dir: Directory that receives ``data.pt`` (created if missing).
        metadata: Mapping with the keys ``'data'`` (node features),
            ``'edge_index'`` (edge list) and ``'label'`` (node targets); each
            value must be convertible with ``torch.tensor``.
        transform, pre_transform, pre_filter: Forwarded unchanged to the base
            class.
    """

    def __init__(self, root, out_dir, metadata, transform=None, pre_transform=None, pre_filter=None):
        # Assigned before the base constructor runs because the properties and
        # process() below read them (the notebook output shows "Processing..."
        # being printed during construction).
        self.root = root
        self.out_dir = out_dir
        self.metadata = metadata
        super(GraphDataset, self).__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        # The inputs arrive through `metadata`, so there are no raw files on
        # disk. Returning an empty list also removes the dependency on a
        # notebook-level `root` variable having been defined.
        return []

    @property
    def processed_file_names(self):
        # NOTE(review): these are paths below `out_dir`, not file names
        # relative to the base class's processed directory -- confirm how the
        # base class resolves them before relying on any caching behaviour.
        return glob.glob(f'{self.out_dir}/*.pt')

    def download(self):
        # Nothing to download: the data is passed in memory.
        pass

    def process(self):
        """Convert the metadata arrays to tensors and save them as one ``Data`` object."""
        self.node_features = torch.tensor(self.metadata['data'], dtype=torch.float)
        self.edge_index = torch.tensor(self.metadata['edge_index'], dtype=torch.int64)
        self.label = torch.tensor(self.metadata['label'], dtype=torch.int64)

        data = Data(x=self.node_features,
                    edge_index=self.edge_index,
                    y=self.label)

        # Do not rely on the caller having created the output directory.
        os.makedirs(self.out_dir, exist_ok=True)
        torch.save(data, f'{self.out_dir}/data.pt')

    def len(self):
        """Return the number of graphs in the dataset, which is always one."""
        # Independent of process() having run in this session.
        return 1

    def get(self, idx=0):
        """Load and return the stored graph.

        Args:
            idx: Index of the requested graph. Only ``0`` is valid; the default
                keeps the argument-free call ``dataset.get()`` working.

        Raises:
            IndexError: If ``idx`` is not ``0``.
        """
        if idx != 0:
            raise IndexError(f'GraphDataset holds a single graph; got index {idx}')
        # The file was written by process() above and holds a pickled Data
        # object, so full unpickling is requested explicitly instead of relying
        # on the library default for `weights_only`.
        data = torch.load(f'{self.out_dir}/data.pt', weights_only=False)
        return data

In [9]:
root = ''
out_dir = 'graph_data'
# exist_ok=True keeps this cell re-runnable: a plain mkdir raises
# FileExistsError once the directory exists from an earlier run.
os.makedirs(out_dir, exist_ok=True)
metadata = graph_metadata
dataset = GraphDataset(root=root, out_dir=out_dir, metadata=metadata)

Processing...
Done!


In [10]:
# Load the stored graph and print a short summary of it.
data = dataset.get()
print(
    '',
    data,
    '=============================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    sep='\n',
)


Data(x=[7624, 7842], edge_index=[2, 27806], y=[7624, 1])
Number of nodes: 7624
Number of edges: 27806


### Train/valid/test split

In [11]:
# Only the validation and test fractions are passed to the transform; the
# remaining nodes (about 60% in the recorded output) form the training set.
# train_ratio = 0.6
valid_ratio = 0.2
test_ratio = 0.2
split = RandomNodeSplit(num_val=valid_ratio, num_test=test_ratio)
split_graph = split(data)
# Later cells read train_mask / val_mask / test_mask from `data`. Use the
# object returned by the transform instead of assuming the input was modified
# in place (NOTE(review): newer PyG releases appear to apply transforms to a
# shallow copy -- confirm for the installed version).
data = split_graph

In [12]:
# Count the nodes selected by each boolean mask.
print(f'Number of training nodes: {data.train_mask.sum().item()}')
print(f'Number of validation nodes: {data.val_mask.sum().item()}')
print(f'Number of testing nodes: {data.test_mask.sum().item()}')

Number of training nodes: 4574
Number of validation nodes: 1525
Number of testing nodes: 1525


### Build GNN

In [13]:
class GCN(torch.nn.Module):
    """Three stacked ``GCNConv`` layers followed by a linear classification head.

    Args:
        hidden_channels1: Output width of the first graph convolution.
        hidden_channels2: Output width of the second graph convolution.
        hidden_channels3: Output width of the third graph convolution.
        in_channels: Number of input node features. Defaults to the
            notebook-level ``num_features`` when omitted.
        out_channels: Number of output classes. Defaults to the notebook-level
            ``num_classes`` when omitted.
    """

    def __init__(self, hidden_channels1, hidden_channels2, hidden_channels3,
                 in_channels=None, out_channels=None):
        super().__init__()
        # Fall back to the notebook globals so existing three-argument calls
        # keep working unchanged.
        if in_channels is None:
            in_channels = num_features
        if out_channels is None:
            out_channels = num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels1)
        self.conv2 = GCNConv(hidden_channels1, hidden_channels2)
        self.conv3 = GCNConv(hidden_channels2, hidden_channels3)
        self.linear = torch.nn.Linear(hidden_channels3, out_channels)

    def forward(self, x, edge_index):
        """Return the output of the final linear layer for every node.

        No softmax or log-softmax is applied, so the result is raw class
        scores. No dropout is applied between layers.
        """
        # Each convolution is followed by a ReLU, including the last one.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = conv(x, edge_index).relu()
        return self.linear(x)

In [14]:
# Hidden widths shrink layer by layer: 256 -> 128 -> 64.
model = GCN(hidden_channels1=256, hidden_channels2=128, hidden_channels3=64)
print(model)

GCN(
  (conv1): GCNConv(7842, 256)
  (conv2): GCNConv(256, 128)
  (conv3): GCNConv(128, 64)
  (linear): Linear(in_features=64, out_features=18, bias=True)
)


In [15]:
# Cross-entropy over the class scores produced by the model.
criterion = torch.nn.CrossEntropyLoss()
# Adam with equal learning rate and weight decay.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=5e-4,
    weight_decay=5e-4,
)

In [16]:
def train():
    """Run one full-graph optimisation step and return the training loss tensor."""
    model.train()
    optimizer.zero_grad()
    mask = data.train_mask
    # The forward pass covers every node; only training nodes enter the loss.
    logits = model(data.x, data.edge_index)
    targets = data.y[mask].flatten()
    loss = criterion(logits[mask], targets)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test(mask=None):
    """Return the classification accuracy on the nodes selected by ``mask``.

    Args:
        mask: Boolean node mask to evaluate on. Defaults to ``data.test_mask``,
            so the argument-free call still scores the test split; pass
            ``data.val_mask`` to score the validation split.

    Returns:
        Fraction of selected nodes whose predicted class equals the label.
    """
    # Gradients are disabled by the decorator: evaluation needs no autograd
    # graph, and building one only costs memory.
    model.eval()
    if mask is None:
        mask = data.test_mask
    pred = model(data.x, data.edge_index).argmax(dim=1)
    correct = pred[mask] == data.y[mask].flatten()
    return int(correct.sum()) / int(mask.sum())

# Train for 100 full-graph steps, printing the training loss after each one
# (epochs are displayed 1-based).
for epoch in range(100):
    loss = train()
    print(f'Epoch: {epoch+1:02d}, Loss: {loss:.4f}')

Epoch: 01, Loss: 2.9054
Epoch: 02, Loss: 2.7162
Epoch: 03, Loss: 2.5399
Epoch: 04, Loss: 2.3837
Epoch: 05, Loss: 2.2473
Epoch: 06, Loss: 2.1221
Epoch: 07, Loss: 1.9989
Epoch: 08, Loss: 1.8814
Epoch: 09, Loss: 1.7704
Epoch: 10, Loss: 1.6623
Epoch: 11, Loss: 1.5577
Epoch: 12, Loss: 1.4608
Epoch: 13, Loss: 1.3737
Epoch: 14, Loss: 1.2971
Epoch: 15, Loss: 1.2299
Epoch: 16, Loss: 1.1715
Epoch: 17, Loss: 1.1207
Epoch: 18, Loss: 1.0762
Epoch: 19, Loss: 1.0361
Epoch: 20, Loss: 0.9980
Epoch: 21, Loss: 0.9617
Epoch: 22, Loss: 0.9281
Epoch: 23, Loss: 0.8976
Epoch: 24, Loss: 0.8690
Epoch: 25, Loss: 0.8415
Epoch: 26, Loss: 0.8151
Epoch: 27, Loss: 0.7902
Epoch: 28, Loss: 0.7661
Epoch: 29, Loss: 0.7432
Epoch: 30, Loss: 0.7214
Epoch: 31, Loss: 0.7004
Epoch: 32, Loss: 0.6802
Epoch: 33, Loss: 0.6612
Epoch: 34, Loss: 0.6429
Epoch: 35, Loss: 0.6252
Epoch: 36, Loss: 0.6083
Epoch: 37, Loss: 0.5921
Epoch: 38, Loss: 0.5767
Epoch: 39, Loss: 0.5624
Epoch: 40, Loss: 0.5488
Epoch: 41, Loss: 0.5356
Epoch: 42, Loss:

In [17]:
# Evaluate the trained model on the held-out test nodes.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.8184


In [28]:
from torch_geometric.explain import Explainer, GNNExplainer
# Initialize explainer
explainer = Explainer(
    model=model,
    algorithm=GNNExplainer(epochs=200),
    explanation_type='model',
    node_mask_type='attributes',
    edge_mask_type='object',
    model_config=dict(
        mode='multiclass_classification',
        task_level='node',
        # GCN.forward returns the output of its final Linear layer with no
        # softmax / log_softmax applied, so the declared return type must be
        # 'raw' rather than 'log_probs'.
        return_type='raw',
    ),
)
# Explain the model's prediction for a single node.
node_index = 500
explanation = explainer(data.x, data.edge_index, index=node_index)
print(f'Generated explanations in {explanation.available_explanations}')

# Plot of the ten highest-scoring features.
path = 'feature_importance.png'
explanation.visualize_feature_importance(path, top_k=10)
print(f"Feature importance plot has been saved to '{path}'")

# Drawing of the explanation subgraph.
path = 'subgraph.pdf'
explanation.visualize_graph(path)
print(f"Subgraph visualization plot has been saved to '{path}'")

Generated explanations in ['node_mask', 'edge_mask']
Feature importance plot has been saved to 'feature_importance.png'
Subgraph visualization plot has been saved to 'subgraph.pdf'


In [25]:
# Display the edge mask produced by the explainer for the selected node.
explanation.edge_mask

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [19]:
# node_feat_mask, edge_feat_mask = explainer.explain_node(node_idx, data.x, data.edge_index)

In [26]:
# Draw the explanation subgraph once more, this time forcing the graphviz
# backend, and write it to a separate file.
path = 'graph.pdf'
explanation.visualize_graph(path, backend="graphviz")