### So, let's begin our graph learning trip from the easy sample (Kalate club dataset)

### First, get and deal the data to unified format 

1. import library 

2. deal to the standard format

<34x34 sparse matrix of type '<class 'numpy.intc'>'
	with 156 stored elements in COOrdinate format>

In [69]:
import torch
import torch_geometric
import pygod
import networkx as nx
import numpy as np
from sklearn.preprocessing import StandardScaler

G = nx.karate_club_graph()
labels = np.asarray([G.nodes[i]['club'] != 'Mr. Hi' for i in G.nodes]).astype(np.int64)
adj = nx.to_scipy_sparse_matrix(G).tocoo()
row = torch.from_numpy(adj.row.astype(np.int64)).to(torch.int64)
col = torch.from_numpy(adj.col.astype(np.int64)).to(torch.int64)
edge_index = torch.stack([row,col],dim=0)

embeddings = np.array(list(dict(G.degree()).values()))
scale = StandardScaler()
embeddings = scale.fit_transform(embeddings.reshape(-1,1))

### Second,

1. using the *InMemoryDataset* limiting the random data to a standard dataset



In [70]:
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T

# custom dataset
class KarateDataset(InMemoryDataset):
    def __init__(self, transform=None):
        super(KarateDataset, self).__init__('.', transform, None, None)

        data = Data(edge_index=edge_index)
        
        data.num_nodes = G.number_of_nodes()
        
        # embedding 
        data.x = torch.from_numpy(embeddings).type(torch.float32)
        
        # labels
        y = torch.from_numpy(labels).type(torch.long)
        data.y = y.clone().detach()
        
        data.num_classes = 2

        # splitting the data into train, validation and test
        X_train, X_test, y_train, y_test = train_test_split(pd.Series(list(G.nodes())), 
                                                            pd.Series(labels),
                                                            test_size=0.30, 
                                                            random_state=42)
        
        n_nodes = G.number_of_nodes()
        
        # create train and test masks for data
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[X_train.index] = True
        test_mask[X_test.index] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask

        self.data, self.slices = self.collate([data])

    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)
    
dataset = KarateDataset()
data = dataset[0]


### Third, It's time showing the power of standard dataset.

1. build GNN network


In [131]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    def __init__(self,input_dim = data.num_features,output_dim = data.num_classes):
        super(Net,self).__init__()
        # Two layers receptive field
        self.conv1 = GCNConv(input_dim,16)
        self.conv2 = GCNConv(16,output_dim)
    
    def forward(self,data):
        x, edge_index= data.x, data.edge_index
        x = F.leaky_relu(self.conv1(x, edge_index))
        x = F.dropout(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x,dim=1)

model = Net()
model.to(device="cuda" if torch.cuda.is_available() else "cpu")
model.train()

Net(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

In [132]:
from tqdm import tqdm
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 1e-2
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
epochs = 2000
def train():
    for i in tqdm(range(epochs)):
        optimizer.zero_grad()
        output = model(data.to(device=device))
        F.nll_loss(model(data)[data.train_mask],data.y[data.train_mask]).backward()
        optimizer.step()


@torch.no_grad()
def test():
    model.eval()
    logits = model(data.to(device))
    mask1 = data["train_mask"]
    pred1 = logits[mask1].max(1)[1]
    # print (f"logits[mask1].max(1):{logits[mask1].max(1)}")
    # max(1) mean to find max in dimension one. And have the max value and the max index 
    acc1 = pred1.eq(data.y[mask1]).sum().item() / mask1.sum().item()
    print (f"Train_mask accuracy: {pred1.eq(data.y[mask1]).sum().item()}/{mask1.sum().item()} || {round(acc1,2)}")
    
    mask2 = data["test_mask"]
    pred2 = logits[mask2].max(1)[1]
    # print (f"logits[mask2].max(1):{logits[mask1].max(1)}")
    acc2 = pred2.eq(data.y[mask2]).sum().item() / mask2.sum().item()
    print (f"Test_mask accuracy: {pred2.eq(data.y[mask2]).sum().item()}/{mask2.sum().item()} || {round(acc2,2)}")

train()

100%|██████████| 2000/2000 [00:30<00:00, 65.65it/s]


In [133]:
test()

Train_mask accuracy: 21/23 || 0.91
Train_mask accuracy: 8/11 || 0.73


Issues in the experiment: all variables are just name, cite. Just like below.

In [81]:
x = "12"
print (id(x))
x = "23"
print (id(x))

x = "12"
print (id(x))
x = "23" + x
print (id(x))

2682695633200
2682695474160
2682695633200
2682695517552
