In [54]:
import torch
from torch_geometric.data import Data

# Social Network Graph Dataset

### classification = f(x_feature)
### classification = f(x_feature=num_of_connected_node_to_each_node)

    - It is a social network example, in which Karate club members represented in a Graph
    - A social network of a karate club in which Karate club members represented in a Graph
    - The network captures 34 members/students of a karate club, documenting 78 links between pairs of members who interacted outside the club. 
    - Two classes: In the A conflict arose between the administrator "John A" and instructor "Mr. Hi" (pseudonyms), which led to the split of the club into two.
    - the x-feature that is being used to classify the 34 nodes is the # of connections of each node to others ==> G.degree().
        - In other words: x_feature = node_degrees
    
As seen in the graph below, it seems member34 contacted many people. As example, contact for member2 to member8 is shown below:

    [2 1]
    [3 1] [3 2]
    [4 1] [4 2] [4 3]
    [5 1]
    [6 1]
    [7 1] [7 5] [7 6]
    [8 1] [8 2] [8 3] [8 4]

<img src="img\zachary_karate_graph.png" width="750" height="750"/>


# Preprocessing:  
Generate numerical representations for each node in the graph (node degree in this case).

    - Reading the Karate_club graph data from networkx
    - Defining the labels (for each 34 nodes)
    - adjacent matrix [in COO sprse format]
    - Edge data in COO coordinate (the edge_index)
    - x_feature = Embedding ==> Which is the selected and simplest numerical representation of our nodes ==> We actually use the scaled embedding which is the scaled degree of freedom for each node
            ==> embeddings = np.array(list( dict(G.degree()).values() )) ==> Simply shows how many connections each node has
    - y_label = Two classes:
        G.nodes[1] ==> {'club': 'Mr. Hi'}
        G.nodes[9] ==> {'club': 'Officer'}
    - x_feature [=embedding] and edge_index are useful as input to PyG convloutional layer [this is how the library written]:
        self.conv1(x_feature, edge_index)

    
**Note**: For the numerical representation for nodes, we can use graph properties like degree or use different embedding generation methods like node2vec, DeepWalk etc. In this example, we will be using **node degree as its numerical representation**.

Using COO to save sparse matrix in memory ==> https://matteding.github.io/2019/04/25/sparse-matrices/


In [1]:
import networkx as nx
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# load graph from networkx library
G = nx.karate_club_graph()

# retrieve the labels for each node ==> convert True/False to 0/1 for a binary classifications
labels = np.asarray([G.nodes[i]['club'] != 'Mr. Hi' for i in G.nodes]).astype(np.int64)

# print("# of nodes: ", G.number_of_nodes())
# print("nodes in G: ", G.nodes())

In [2]:
# create edge index from
# good source on how to store a sparse matrix in COO format: ==> https://matteding.github.io/2019/04/25/sparse-matrices/
adj = nx.to_scipy_sparse_matrix(G).tocoo() # sparsematrix object in COO coordinate and format

rows_idx = adj.row.astype(np.int64) # getting row idx in COO format
col_idx = adj.col.astype(np.int64) # getting col idx in COO format

rows_idx_torch = torch.from_numpy(rows_idx).to(torch.long)
col_idx_torch = torch.from_numpy(col_idx).to(torch.long)

edge_index = torch.stack([rows_idx_torch, col_idx_torch], dim=0) # edge idx is simply what node is connected to what node

# using degree as embedding ==> basically each node is connected to how many other nodes
# print("Grapgh degree: ",  dict(G.degree())   )
embeddings = np.array(list( dict(G.degree()).values() ))

# normalizing degree values
scale = StandardScaler()
embeddings = scale.fit_transform(embeddings.reshape(-1,1))  # make it N-rows and 1 column

# The PyG Custom Dataset

The KarateDataset class inherits from the InMemoryDataset class and use a Data object to collate all information relating to the karate club dataset. The graph data is then split into train and test sets, thereby creating the train and test masks using the splits.

The data object contains the following variables:

    - "Data(edge_index=[2, 156], num_nodes=34, x=[34, 1], y=[34], num_classes=2, train_mask=[34], test_mask=[34])"
    - As seen above, the feature vector, x, for each of 34 node is a vector of size 1. It can be a size of m in more complex problems



This custom dataset can now be used with **several graph neural network models** from the Pytorch Geometric library. Letâ€™s pick a **Graph Convolutional Network model** and use it to predict the missing labels on the test set.

$Note$: PyG library focuses more on _node classification task_ but it can also be used for _link prediction_.

Reference: 
https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html#exercises

In [3]:
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T


# PyG custom dataset
class KarateDataset(InMemoryDataset):
    def __init__(self, transform=None):
        super(KarateDataset, self).__init__('.', transform, None, None)

        data = Data(edge_index=edge_index)
        
        data.num_nodes = G.number_of_nodes()
        
        # embedding = x_feature ==> here for simplicity x_feature for each node = [# of connected nodes]
        data.x = torch.from_numpy(embeddings).type(torch.float32)
        
        # labels
        y = torch.from_numpy(labels).type(torch.long)
        data.y = y.clone().detach()
        
        data.num_classes = 2

        # splitting the data into train, validation and test
        X_feature_only_for_indexing = pd.Series(list(G.nodes())) 
        y_label = pd.Series(labels)
        X_train, X_test, y_train, y_test = train_test_split(X_feature_only_for_indexing, 
                                                            y_label,
                                                            test_size=0.30, 
                                                            random_state=42)
#         print(X_train)
#         print(X_train.index)

        
        n_nodes = G.number_of_nodes()
        
        # create train and test masks for data
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[X_train.index] = True
        test_mask[X_test.index] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask
#         print(train_mask)
#         print(data['train_mask'])

        self.data, self.slices = self.collate([data])

    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)
    
dataset = KarateDataset()
data = dataset[0]

In [4]:
data

Data(edge_index=[2, 156], num_nodes=34, x=[34, 1], y=[34], num_classes=2, train_mask=[34], test_mask=[34])

In [5]:
# print(data.train_mask)
# print(data.x) # will print the scaled embedding of size = 34X1
# print(data.y) # print labels

# Graph Convolutional Network.

In [6]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# GCN with 2 layers
class Net(torch.nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = GCNConv(data.num_features, 16)
        self.conv2 = GCNConv(16, int(data.num_classes))

    def forward(self):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data =  data.to(device)

model = Net().to(device)

# Train the GCN model

In [7]:

torch.manual_seed(42)

optimizer_name = "Adam"
lr = 1e-1
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
epochs = 200

def train():
    model.train()
    optimizer.zero_grad()
    y_pred = model()[data.train_mask]
    y_lable = data.y[data.train_mask] 
    F.nll_loss(y_pred, y_lable).backward()
    optimizer.step()

@torch.no_grad()
def test():
    model.eval()
    logits = model()
    # for traning set
    mask1 = data['train_mask']
    pred1 = logits[mask1].max(1)[1]
    label1 = data.y[mask1]
    acc_train = pred1.eq(label1).sum().item() / mask1.sum().item()
    # for test set
    mask = data['test_mask']
    pred = logits[mask].max(1)[1]
    acc_test = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
    return acc_train,acc_test

for epoch in range(1, epochs):
    train()

train_acc,test_acc = test()

print('#' * 70)
print('Train Accuracy: %s' %train_acc )
print('Test Accuracy: %s' % test_acc)
print('#' * 70)

######################################################################
Train Accuracy: 0.782608695652174
Test Accuracy: 0.6363636363636364
######################################################################


# Reference:
https://towardsdatascience.com/a-beginners-guide-to-graph-neural-networks-using-pytorch-geometric-part-1-d98dc93e7742