In [1]:
import os
import numpy as np
from typing import List, Tuple, Dict, Optional
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
import warnings

warnings.filterwarnings("ignore")

In [2]:
data_path = r'D:\data\benchmark\cora'

label_dict = {'Case_Based':0,
              'Genetic_Algorithms':1,
              'Neural_Networks':2,
              'Probabilistic_Methods':3,
              'Reinforcement_Learning':4,
              'Rule_Learning':5,
              'Theory':6}

In [3]:
def construct_data(data_path:str,
                   label_dict:Dict[str,int]=label_dict,
                   train_size: float=1.0,
                   val_size: float=0.0,
                   test_size: float=0.0,
                   random_seed: int=42)->torch_geometric.data.Data:
        
    '''
    :param data_path: cora data path
    :param label_dict: cora data label
    :param train_size: train sample size
    :param val_size: validation sample size
    :param test_size: test sample size
    :param random_seed: random seed
    
    :return: torch_geometric data
    '''
    
    # load edges
    with open(os.path.join(data_path, 'cora.cites')) as f:
        cites = f.readlines()
    # load node features
    with open(os.path.join(data_path, 'cora.content')) as f:
        contents = f.readlines()
    
    cites = sorted([list(map(int,i.strip().split('\t'))) for i in cites])
    contents_preprocessed = []
    for doc in contents:
        doc_preprocessed = []
        doc_split = doc.strip().split('\t')
        for idx, token in enumerate(doc_split):
            if idx < len(doc_split)-1:
                doc_preprocessed.append(int(token))
            else:
                label = label_dict.get(token)
                doc_preprocessed.append(label)

        contents_preprocessed.append(doc_preprocessed)

    contents = sorted(contents_preprocessed, key=lambda x: x[0])
    
    id_unique = set([i[0] for i in contents])
    id_map = {a:i for i,a in enumerate(sorted(list(id_unique)))}
    
    cites_num = 0
    node_features, node_labels, edges = [], [], []
    
    for idx,content in enumerate(contents):

        paper_id = content[0]
        # first value = paper id, last value = class label
        doc_attrs = content[1:-1]
        class_label = content[-1]
        
        while paper_id != cites[cites_num][0]:
            
            edge_pair = []
            edge_pair.append(id_map.get(cites[cites_num][0]))
            edge_pair.append(id_map.get(cites[cites_num][1]))
            edges.append(edge_pair)
            # undirected
            edge_pair = []
            edge_pair.append(id_map.get(cites[cites_num][1]))
            edge_pair.append(id_map.get(cites[cites_num][0]))
            edges.append(edge_pair)
            # break loop
            if cites_num == len(cites)-1:
                break
            cites_num += 1
        
        node_features.append(doc_attrs)
        node_labels.append(class_label)
    
    # train, val, test split
    total_nodes = list(range(len(contents)))
    random.seed(random_seed)
    random.shuffle(total_nodes)
    train_idx, val_idx, test_idx = np.split(total_nodes, 
                                            [int(train_size*len(total_nodes)), 
                                             int((train_size+val_size)*len(total_nodes))])
        
    data = torch_geometric.data.Data(x=torch.tensor(node_features, dtype=torch.float),
                                     y=torch.tensor(node_labels, dtype=torch.long),
                                     edge_index=torch.tensor(torch.tensor(edges)).T,
                                     train_idx=train_idx,
                                     val_idx=val_idx,
                                     test_idx=test_idx)
    
    return data

In [4]:
class GCNModel(nn.Module):
    def __init__(self, 
                 input_dim: int, 
                 hidden_dim: int, 
                 output_dim: int, 
                 training: bool):
        super(GCNModel, self).__init__()
        
        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        self.lns = nn.ModuleList()
        for l in range(1):
            self.lns.append(nn.LayerNorm(hidden_dim))
        self.training = training
        
        for l in range(2):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))
            
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(0.3),
            nn.Linear(hidden_dim, output_dim))
        
        self.dropout = 0.3
        self.num_layers = 2
        
    def build_conv_model(self,
                         input_dim: int,
                         hidden_dim: int):
        return GCNConv(input_dim, hidden_dim)
    
    def forward(self, 
                data: torch_geometric.data.Data):
        
        x, edge_index, batch = data.x, data.edge_index, data.batch
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1)
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p = self.dropout, training=self.training)
            if not i == self.num_layers - 1:
                x = self.lns[i](x)
        
        
        x = self.post_mp(x)
        
        return x
    
    def loss(self, pred, label):
        return F.cross_entropy(pred, label)

# training

In [5]:
device = 'cpu'

data = construct_data(data_path, train_size=0.7, val_size=0.2, test_size=0.1)
total_size = data.x.shape[0]
train_size = len(data.train_idx)
val_size = len(data.val_idx)
# input dim : dimension of node features
model = GCNModel(input_dim=1433, hidden_dim=128, output_dim=7, training=True).to(device)

train_dataloader = torch_geometric.loader.DataLoader([data],batch_size=total_size,shuffle=True)
val_dataloader = torch_geometric.loader.DataLoader([data],batch_size=total_size,shuffle=False)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(20):
    epoch_loss = 0
    optimizer.zero_grad()
    
    for i,train_batch in enumerate(train_dataloader):
        train_batch = train_batch.to(device)
        pred = model(train_batch)
        label = train_batch.y
        loss = model.loss(pred[data.train_idx], label[data.train_idx])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        print('Epoch {} train loss : {:.8f}'.format(epoch+1, epoch_loss))
    
    if (epoch+1)%5==0:
        model.eval()
        for i,val_batch in enumerate(val_dataloader):
            val_batch = val_batch.to(device)
            with torch.no_grad():
                pred = model(val_batch)
                label = val_batch.y
                loss = model.loss(pred[data.val_idx], label[data.val_idx])
                model_score = f1_score(y_true = label[data.val_idx],
                                       y_pred = pred.argmax(1)[data.val_idx],
                                       average = 'micro')
        print('===============================')
        print('Epoch {} validation loss : {:.4f}'.format(epoch+1, loss.item()))
        print('Epoch {} f1 score : {:.4f}'.format(epoch+1, model_score))
        print('===============================')

Epoch 1 train loss : 1.90685487
Epoch 2 train loss : 1.78706861
Epoch 3 train loss : 1.71203530
Epoch 4 train loss : 1.62397397
Epoch 5 train loss : 1.53358936
Epoch 5 validation loss : 1.3484
Epoch 5 f1 score : 0.4557
Epoch 6 train loss : 1.32751226
Epoch 7 train loss : 1.20975363
Epoch 8 train loss : 1.09696209
Epoch 9 train loss : 0.98926634
Epoch 10 train loss : 0.88467997
Epoch 10 validation loss : 0.8269
Epoch 10 f1 score : 0.8247
Epoch 11 train loss : 0.78524959
Epoch 12 train loss : 0.69512892
Epoch 13 train loss : 0.61588442
Epoch 14 train loss : 0.54536206
Epoch 15 train loss : 0.48194444
Epoch 15 validation loss : 0.4872
Epoch 15 f1 score : 0.8764
Epoch 16 train loss : 0.42626092
Epoch 17 train loss : 0.37861359
Epoch 18 train loss : 0.33798730
Epoch 19 train loss : 0.30307183
Epoch 20 train loss : 0.27289969
Epoch 20 validation loss : 0.3538
Epoch 20 f1 score : 0.8948
