# Graph Neural Networks Baseline

In [8]:
import numpy as np
import pickle
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn.functional as F
from torch.nn import Linear

In [3]:
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader

In [40]:
from sklearn.model_selection import StratifiedKFold

## Load data

In [5]:
DATA_FOLDER = '../data'
PICKLE_FOLDER = '../pickles'

In [6]:
with open(f'{PICKLE_FOLDER}/timeseries.pickle', 'rb') as f:
    ts = pickle.load(f)

In [17]:
total_samples, total_brain_regions, ts_length = ts.shape

In [9]:
df_metadata = pd.read_csv(f'{DATA_FOLDER}/patients-cleaned.csv', index_col=0)

In [10]:
df_metadata.head(2)

Unnamed: 0,age,sex,target
0,24.75,1,0
1,27.667,1,0


### Select connectivity dataset

In [11]:
THRESHOLD = 0.1
EDGE_FEATURES = 'binary'
CORR_TYPE = 'pearson'

In [12]:
with open(f'{PICKLE_FOLDER}/fc-{CORR_TYPE}-th-{THRESHOLD}-{EDGE_FEATURES}.pickle', 'rb') as f:
    edge_index_matrix = pickle.load(f)

In [13]:
edge_index_matrix.shape

(190, 90, 90)

## Split data

In [14]:
with open(f'{PICKLE_FOLDER}/test-indices.pickle', 'rb') as f:
    test_indices = pickle.load(f)

In [18]:
train_indices = ~np.isin(np.arange(total_samples), test_indices)

In [19]:
edge_index_matrix_train = edge_index_matrix[train_indices]
node_features_train = ts[train_indices]
y_train = df_metadata.iloc[train_indices]["target"].reset_index(drop=True)

In [20]:
print('Training data')
print(f'Edge matrix: {edge_index_matrix_train.shape}')
print(f'Node features: {node_features_train.shape}')
print(f'Target: {y_train.shape}')

Training data
Edge matrix: (140, 90, 90)
Node features: (140, 90, 400)
Target: (140,)


## Prepare data

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [43]:
dataset = np.array([Data(
    x=torch.from_numpy(x).type(dtype=torch.long).to(device), 
    edge_index=torch.from_numpy(np.asarray(np.nonzero(edge_index_matrix_train[0]))).to(device)
) for x in ts])

## Define model

In [37]:
class BaselineGCN(torch.nn.Module):
    
    def __init__(self, hidden_channels):
        super(BaselineGCN, self).__init__()
        torch.manual_seed(42)
        self.conv1 = GCNConv(ts_length, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.fc1 = Linear(hidden_channels, 2)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.softmax(self.fc1(x))
        
        return x

In [38]:
net = BaselineGCN(hidden_channels=64)
net.to(device)

BaselineGCN(
  (conv1): GCNConv(400, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (fc1): Linear(in_features=64, out_features=2, bias=True)
)

## Train model

In [41]:
skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)

In [None]:
for kfold, (train_index, val_index) in enumerate(skf.split(np.zeros(total_samples), y_train)):
    
    # Init model.
    net = BaselineGCN(hidden_channels=64).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Prepare data.
    X_train = dataset[train_index]
    X_val = dataset[val_index]
    
    trainloader = torch.utils.data.DataLoader(X_train, batch_size=4, shuffle=True)
    valloader = torch.utils.data.DataLoader(X_val, batch_size=4, shuffle=False)

    # Train.
    epoch_losses = []
    for epoch in range(EPOCHS):
        running_loss = 0.
        epoch_size = 0
        for data in trainloader:

            optimizr.zero_grad()
            outputs = net.forward(images)
            loss = criterion(outputs, labels.squeeze(-1).to(torch.long))
            loss.backward()

            optimizr.step()

            running_loss += loss.item()
            epoch_size += 1

        epoch_losses.append(running_loss / epoch_size)

    # Plot training loss.
    plt.plot(range(EPOCHS), epoch_losses)
    plt.show()
    
    # Evaluate fold.
    tp, tn, fp, fn = 0, 0, 0, 0
    total = 0

    with torch.no_grad():
        for data in valloader:
            outputs = net(images)

            predicted = torch.where(outputs[:,0] < outputs[:,1], 1, 0)   # outputs.view(-1) > 0.5

            total += labels.size(0)

            labels = labels.view(-1)

            pred_positives = predicted == 1
            label_positives = labels == 1

            tp += (pred_positives & label_positives).sum().item()
            tn += (~pred_positives & ~label_positives).sum().item()
            fp += (pred_positives & ~label_positives).sum().item()
            fn += (~pred_positives & label_positives).sum().item()

    print('===========================================')
    print(f'Accuracy: {(tp + tn) / total * 100:.2f} %')
    print(f'Precision: {tp / (tp + fp) * 100:.2f} %')
    print(f'Recall: {tp / (tp + fn) * 100:.2f} %')
    print(f'TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}')
    print('===========================================')

    print(f'Finished fold #{kfold}\n')

print('Finished training')

In [48]:
model = BaselineGCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        data.to(device)
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.



RuntimeError: result type Float can't be cast to the desired output type Long

In [None]:
def test(loader):
    model.eval()

    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        out = model(data.x, data.edge_index, data.batch)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 201):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')