# Graph Neural Networks Baseline

In [1]:
import numpy as np
import pickle
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn.functional as F
from torch.nn import Linear

In [3]:
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader

In [4]:
from sklearn.model_selection import StratifiedKFold

## Load data

In [5]:
DATA_FOLDER = '../data'
PICKLE_FOLDER = '../pickles'

In [6]:
with open(f'{PICKLE_FOLDER}/timeseries.pickle', 'rb') as f:
    ts = pickle.load(f)

In [7]:
total_samples, total_brain_regions, ts_length = ts.shape

In [8]:
df_metadata = pd.read_csv(f'{DATA_FOLDER}/patients-cleaned.csv', index_col=0)

In [9]:
df_metadata.head(2)

Unnamed: 0,age,sex,target
0,24.75,1,0
1,27.667,1,0


### Select connectivity dataset

In [13]:
THRESHOLD = 0.1                         # 0.01, 0.05, 0.1, 0.15
EDGE_FEATURES = 'binary'                # 'binary', 'real'
CORR_TYPE = 'pearson'                   # 'pearson', 'spearman', 'partial-pearson'
THRESHOLD_METHOD = 'abs-sample-diff'    # 'abs-sample-diff', 'abs-group-avg-diff'
THRESHOLD_TYPE = 'min'                  # 'min', 'max'

In [14]:
fc_file = f'{PICKLE_FOLDER}/fc-{CORR_TYPE}-{THRESHOLD_METHOD}/{THRESHOLD_TYPE}-th-{THRESHOLD}-{EDGE_FEATURES}.pickle'

In [15]:
with open(fc_file, 'rb') as f:
    edge_index_matrix = pickle.load(f)

In [16]:
edge_index_matrix.shape

(190, 90, 90)

## Split data

In [17]:
with open(f'{PICKLE_FOLDER}/test-indices.pickle', 'rb') as f:
    test_indices = pickle.load(f)

In [18]:
train_indices = list(set(range(total_samples)) - set(test_indices))

In [57]:
train_targets = df_metadata.iloc[train_indices]["target"].reset_index(drop=True)

In [91]:
print(f'Train set size: {len(train_indices)}')
print(f'Test set size: {len(test_indices)}')

Train set size: 140
Test set size: 50


## Extend dataset

In [58]:
N_SURROGATES = 5                    # 5
SURROGATE_METHOD = 'iaaft'          # 'iaaft', 'aaft'

In [59]:
with open(f'{PICKLE_FOLDER}/timeseries-{SURROGATE_METHOD}-{N_SURROGATES}.pickle', 'rb') as f:
    ts_surrogates = pickle.load(f)

In [81]:
print(f'Extra data: {ts_surrogates.shape}')
print(f'{ts_surrogates.shape[0] / total_samples} surrogates per sample')

Extra data: (950, 90, 400)
5.0 surrogates per sample


Extend only training data. Test set will consist of true data only.

In [93]:
ts_train_surrogates = np.concatenate([ts_surrogates[N_SURROGATES*i:N_SURROGATES*i+N_SURROGATES] for i in train_indices], axis=0)

In [96]:
ts_train_surrogates.shape

(700, 90, 400)

## Prepare data

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### `Data` object fields

- `data.x`: Node feature matrix with shape `[num_nodes, num_node_features]`

- `data.edge_index`: Graph connectivity in COO format with shape `[2, num_edges]` and type `torch.long`

- `data.edge_attr`: Edge feature matrix with shape `[num_edges, num_edge_features]`

- `data.y`: Target to train against (may have arbitrary shape), e.g., node-level targets of shape `[num_nodes, *]` or graph-level targets of shape `[1, *]`

- `data.pos`: Node position matrix with shape `[num_nodes, num_dimensions]`

In [21]:
dataset = [Data(
    x=torch.from_numpy(ts[i]).to(torch.float32),  
    edge_index=torch.from_numpy(np.asarray(np.nonzero(edge_index_matrix[i]))).to(torch.int64),
    y=torch.tensor([target], dtype=torch.int64)
).to(device) for target, i in zip(train_targets, train_indices)]

In [102]:
print(f'True train data: {len(dataset)}')

True train data: 840


In [97]:
# Extension.
dataset_ext = [Data(
    x=torch.from_numpy(ts_surr).to(torch.float32),  
    edge_index=torch.from_numpy(np.asarray(np.nonzero(edge_index_matrix[i]))).to(torch.int64),
    y=torch.tensor([target], dtype=torch.int64)
).to(device) for target, i, ts_surr in zip(
    np.repeat(train_targets, N_SURROGATES),
    np.repeat(train_indices, N_SURROGATES),
    ts_train_surrogates
)]

In [103]:
print(f'Surrogate train data: {len(dataset_ext)}')

Surrogate train data: 700


In [100]:
dataset.extend(dataset_ext)

In [104]:
print(f'Extended train data: {len(dataset)}')

Extended train data: 840


In [105]:
print('Data object')
print(f'Edge index: {dataset[0].edge_index.shape}')
print(f'Node features: {dataset[0].x.shape}')
print(f'Target: {dataset[0].y.shape}')

Data object
Edge index: torch.Size([2, 6044])
Node features: torch.Size([90, 400])
Target: torch.Size([1])


## Define model

In [124]:
class BaselineGCN(torch.nn.Module):
    
    def __init__(self, hidden_channels):
        super(BaselineGCN, self).__init__()
        torch.manual_seed(42)
        self.conv1 = GCNConv(ts_length, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.fc1 = Linear(hidden_channels, 2)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc1(x)     # CELoss already incorporates `log_softmax`.
        
        return x

In [125]:
net = BaselineGCN(hidden_channels=8)
net.to(device)

BaselineGCN(
  (conv1): GCNConv(400, 8)
  (conv2): GCNConv(8, 8)
  (conv3): GCNConv(8, 8)
  (fc1): Linear(in_features=8, out_features=2, bias=True)
)

## Train model

In [126]:
skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)

In [127]:
EPOCHS = 1000

In [128]:
for kfold, (train_index, val_index) in enumerate(skf.split(np.zeros(len(train_targets)), train_targets)):
    
    # Init model.
    net = BaselineGCN(hidden_channels=64).to(device)
    optimizr = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.5)
    criterion = torch.nn.CrossEntropyLoss()
    net.train()

    # Prepare data.
    X_train = [dataset[i] for i in train_index]
    X_val = [dataset[i] for i in val_index]
    
    trainloader = DataLoader(X_train, batch_size=4, shuffle=True)
    valloader = DataLoader(X_val, batch_size=4, shuffle=False)

    # Train.
    epoch_losses = []
    for epoch in range(EPOCHS):
        running_loss = 0.
        epoch_size = 0
        
        for data in trainloader:
            optimizr.zero_grad()

            outputs = net(data)
            
            loss = criterion(outputs, data.y)  # Compute the loss.
            loss.backward()  # Derive gradients.
            optimizr.step()

            running_loss += loss.item()
            epoch_size += 1

        epoch_losses.append(running_loss / epoch_size)

    # Plot training loss.
    plt.plot(range(EPOCHS), epoch_losses)
    plt.show()
    
    # Evaluate fold.
    tp, tn, fp, fn = 0, 0, 0, 0
    total = 0

    with torch.no_grad():
        net.eval()

        for data in valloader:
            outputs = net(data)

            predicted = outputs.argmax(dim=1)

            total += data.y.size(0)

            labels = data.y.view(-1)

            pred_positives = predicted == 1
            label_positives = labels == 1

            tp += (pred_positives & label_positives).sum().item()
            tn += (~pred_positives & ~label_positives).sum().item()
            fp += (pred_positives & ~label_positives).sum().item()
            fn += (~pred_positives & label_positives).sum().item()

    print('===========================================')
    print(f'Accuracy: {(tp + tn) / total * 100:.2f} %')
    print(f'Precision: {tp / (tp + fp) * 100:.2f} %')
    print(f'Recall: {tp / (tp + fn) * 100:.2f} %')
    print(f'TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}')
    print('===========================================')

    print(f'Finished fold #{kfold}\n')

print('Finished training')

tensor([[-3.5022e-02,  3.5410e-02, -4.1006e-02,  2.3849e-02, -4.2094e-02,
          7.1125e-02, -2.5770e-03, -6.5332e-02, -5.5606e-02,  4.6088e-03,
          2.1107e-02,  4.5986e-02,  1.9869e-02, -3.6340e-02, -1.1291e-04,
         -3.8757e-02,  1.2293e-04,  7.2001e-03,  2.1045e-02,  4.8760e-02,
          9.3344e-03,  1.6418e-02,  3.1810e-02,  3.6885e-02,  2.2806e-02,
         -3.0740e-02,  5.9031e-02, -5.8547e-02,  2.1661e-02, -2.1815e-02,
          5.2773e-02, -5.6954e-02, -5.9659e-03,  5.2666e-02,  4.8222e-02,
         -1.1074e-02, -6.7001e-03,  4.6509e-03, -2.1060e-02,  5.4291e-03,
          1.6698e-02,  7.2888e-02, -7.6186e-02,  9.5884e-03,  6.4558e-02,
          7.9487e-02, -4.9284e-03, -2.5523e-02, -2.3667e-02,  9.1258e-04,
         -8.1411e-02, -1.2315e-02, -6.9843e-02, -1.2384e-02,  7.7733e-03,
          9.3262e-03, -1.4273e-02, -1.1894e-02,  2.9109e-02,  4.4057e-02,
          2.4401e-02,  9.1332e-03,  1.2144e-02,  1.9471e-02],
        [ 1.3108e-02, -1.2711e-02,  2.5043e-02,  5

AttributeError: 'int' object has no attribute 'dropout'