# Lung cancer classification with Graph Convolutional Networks

In [None]:
import os
# If README.md is not in the current working directory, move one directory up
# so that the relative paths used below ('./data/...', 'scripts.gcn') resolve.
# Presumably README.md marks the repository root — confirm against the repo layout.
if not os.path.exists("README.md"):
    os.chdir("../")

import pandas as pd
import numpy as np

import networkx as nx
import matplotlib.pyplot as plt

# for the ML part
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

# for the graph part
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_networkx, from_networkx

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

from collections import Counter

from scripts.gcn import GCN, train, test, train_loop
# Mini-batch size passed to the DataLoader calls in this notebook.
BATCH_SIZE = 16


## Data preprocessing

### import dataset from csv files


In [None]:
# Load the expression table and the phenotype table; the first CSV column of
# each file becomes the index.
degenes = pd.read_csv('./data/final/degenes.csv', index_col=0)
pdata = pd.read_csv('./data/final/pdata_nan_filled.csv', index_col=0)

# Row labels can contain several identifiers joined by '///'; keep the first.
degenes.index = [label.split('///')[0] for label in degenes.index]

# Rescale once. The transposed view is derived from the already-rescaled table
# so that both frames hold the same values (previously the transpose was
# divided by 10 a second time and ended up 100x smaller than the raw data).
degenes = degenes / 10
degenes_t = degenes.T

# Adjacency matrix between nodes; the 'cancer_status' row and column are
# removed so that it is not treated as a graph node.
matrix = pd.read_csv('data/final/adj_matrix.csv', index_col=0)
matrix = matrix.drop(index='cancer_status', columns='cancer_status')

degenes_t.head(3)

In [None]:
# Discard every row of pdata that still contains at least one missing value,
# then display the remaining table.
pdata = pdata.dropna(axis='index', how='any')
pdata

### StandardScaler pass followed by MinMaxScaler pass

In [None]:
# Shared scaler objects. Every fit_transform call below re-fits them, so after
# this cell they hold the statistics of the last frame that was processed.
scaler = StandardScaler()
mmscaler = MinMaxScaler()


def _fit_rescale(transformer, frame):
    """Fit `transformer` on `frame` and return the result with the same labels."""
    rescaled = transformer.fit_transform(frame)
    return pd.DataFrame(rescaled, index=frame.index, columns=frame.columns)


# Expression table: standardise each column, then min-max scale it.
# (An np.exp transform was tried before this step and is disabled.)
degenes_scaled = _fit_rescale(mmscaler, _fit_rescale(scaler, degenes))

# Phenotype table: same two-step rescaling.
pdata_scaled = _fit_rescale(mmscaler, _fit_rescale(scaler, pdata))

degenes_scaled.head(3)

In [None]:
# Preview the first three rows of the rescaled phenotype table.
pdata_scaled.head(n=3)

## Building graph structure from adjacency matrix

In [None]:
# The topology comes from the adjacency matrix only, so it is identical for
# every column of degenes_scaled. Build it once and copy it per column instead
# of re-parsing the matrix on every iteration.
template = nx.from_pandas_adjacency(matrix)
template.remove_nodes_from(list(nx.isolates(template)))
# Replace whatever weights came from the matrix entries with a uniform 1.
nx.set_edge_attributes(template, 1, 'weight')

# One graph per column of degenes_scaled, keyed by the column label. The
# column's values are attached to the nodes as attribute 'x' (index label ->
# node name).
graphs = {}
for column_label, column_values in degenes_scaled.items():
    G = template.copy()
    nx.set_node_attributes(G, column_values.to_dict(), 'x')
    graphs[column_label] = G

### Create pytorch graph structure

In [None]:
# Create a torch_geometric Data object from each networkx graph.
#
# Each Data object gets:
#   x : node features, one float32 value per node, shape (num_nodes, 1)
#   y : [one-hot target of shape (1, 2), float32 row of the remaining pdata columns]
#
# NOTE(review): labels and extra features are read from the unscaled `pdata`;
# confirm whether `pdata_scaled` was intended for the extra features.
data_list = []
for key, value in graphs.items():
    # Graphs whose key has no row in pdata (e.g. rows removed from pdata) are
    # skipped. Only the lookup is guarded, so any other failure surfaces
    # instead of silently shrinking the dataset.
    try:
        cs = pdata.loc[key, 'cancer_status']
    except KeyError:
        continue

    d = from_networkx(value)
    node_values = [attrs['x'] for _, attrs in value.nodes(data=True)]
    d.x = torch.tensor(node_values, dtype=torch.float32).view(-1, 1)

    # One-hot encoding: [0, 1] when cancer_status == 1, otherwise [1, 0].
    target = torch.tensor([[0, 1]] if cs == 1 else [[1, 0]], dtype=torch.float32)

    # Every pdata column except the two dropped here, as a single float32 row.
    additional_features = (
        pdata.loc[key]
        .drop(['cancer_status', 'subjective_assessment'], axis=0)
        .to_frame()
        .T
        .astype('float32')
    )

    d.y = [target, torch.tensor(additional_features.values)]
    data_list.append(d)

### Split into train, validation and test

In [None]:
# 70 % of the samples go to training; the remaining 30 % are split in half
# into a validation set and a test set. Both splits use the same fixed seed.
train_data, holdout_data = train_test_split(data_list, random_state=42, test_size=0.3)
val_data, test_data = train_test_split(holdout_data, random_state=42, test_size=0.5)

# Sizes, in the order (train, test, validation).
(len(train_data), len(test_data), len(val_data))

### Create torch DataLoaders

In [None]:
# One loader per split, all with the same batch size and a fixed
# (non-shuffled) iteration order.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
    for split in (train_data, val_data, test_data)
)

In [None]:
from collections import Counter


def _class_indices(samples):
    """Return the argmax of each sample's target tensor (y[0]) as a Python int."""
    return [torch.argmax(sample.y[0]).item() for sample in samples]


train_cancer_status = _class_indices(train_data)
val_cancer_status = _class_indices(val_data)
test_cancer_status = _class_indices(test_data)

# Report how many samples of each class ended up in every split.
for prefix, statuses in (
    ('train_data: ', train_cancer_status),
    ('val_data: ', val_cancer_status),
    ('test_data: ', test_cancer_status),
):
    print(prefix, Counter(statuses))

## Now, graph classification

In [None]:
# autoreload 
%load_ext autoreload
%autoreload 2
from scripts.gcn import GCN, EarlyStopping, train_loop, test, GCNClassifier

In [None]:
# Instantiate the network (hidden_channels=256) and show its layers.
model = GCN(hidden_channels=256)
print(model)

# Loss function and optimiser (Adam with L2 weight decay) for the training cell.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-3, lr=0.01)

# CPU device handle; the model is not explicitly moved to it in this cell.
device = torch.device("cpu")

In [None]:
# Train for at most 1000 epochs on the training loader, passing the validation
# loader and an EarlyStopping instance (patience 50, delta 1e-4) to train_loop.
stopper = EarlyStopping(patience=50, delta=0.0001, verbose=True)
train_loop(
    model, criterion, optimizer, train_loader, val_loader,
    epochs=1000, early_stopping=stopper, verbose=True,
)

Recorded result with 128 hidden units, 0.2 dropout, 1e-3 L2 weight decay and 0.001 learning rate:
best train loss 0.1018, best validation loss 0.1436.

In [None]:
# Evaluate the trained model on the test loader; the result is unpacked as
# (accuracy, loss) and only the accuracy is printed here.
test_metrics = test(test_loader, model, criterion)
acc, loss = test_metrics
print(f'Test Accuracy: {acc:.4f}')

In [None]:
# Display the test loss and accuracy from the previous evaluation as a tuple.
(loss, acc)

In [None]:
# Development set: the training samples followed by the validation samples,
# in a new list so that train_data itself is left untouched.
dev_set = [*train_data, *val_data]

dev_loader = DataLoader(dev_set, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Seed BEFORE building the model so that any random weight initialisation
# drawn from torch's global RNG is covered by the seed as well (previously the
# seed was set after construction and only affected later random draws).
torch.manual_seed(1)

model = GCN(hidden_channels=64)
print(model)

# Optimiser, loss and device for this run.
device = torch.device("cpu")
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)
criterion = torch.nn.MSELoss()

# Train on the merged train+validation set. dev_loader is passed for both
# loader arguments, so no separate validation data is involved in this run.
# NOTE(review): min_loss=0.08 presumably stops training once the loss drops
# below that value — confirm in scripts.gcn.train_loop.
train_loop(
    model,
    criterion,
    optimizer,
    dev_loader,
    dev_loader,
    epochs=1000,
    verbose=True,
    min_loss=0.08,
)

In [None]:
# Score the model trained in the previous cell on the test loader and report
# its accuracy; the loss is kept in `loss` as well.
dev_run_metrics = test(test_loader, model, criterion)
acc, loss = dev_run_metrics
print(f'Test Accuracy: {acc:.4f}')

### K-Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

# autoreload 
%load_ext autoreload
%autoreload 2
from scripts.gcn import GCN, EarlyStopping, train_loop, test, GCNClassifier

# Configuration that happened to score well: K=3, rs=43, 256 hidden, 0.01 lr, 1e-3 wd
kf = KFold(n_splits=3, shuffle=True, random_state=43)
accuracies = []
losses = []

# Seed once before any model is built so the whole sequence of folds is
# reproducible.
torch.manual_seed(42)
device = torch.device("cpu")

for counter, (train_index, test_index) in enumerate(kf.split(data_list), start=1):
    # Materialise the samples of this fold. Note that this overwrites the
    # notebook-level train_data / test_data and their loaders.
    train_data = [data_list[i] for i in train_index]
    test_data = [data_list[i] for i in test_index]

    # Gaussian-noise augmentation of the node features (x + sqrt(0.5) * N(0, 1))
    # was tried here and is disabled. If it is revived, the samples must be
    # deep-copied first: a list copy shares the Data objects, so adding noise
    # to the "copies" would also modify the originals.

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

    # Fresh model, optimiser and loss for every fold.
    model = GCN(hidden_channels=256)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)
    criterion = torch.nn.MSELoss()

    # NOTE(review): the held-out fold is also passed as the validation loader.
    # If train_loop uses it for stopping or model selection, the reported fold
    # scores are optimistic — confirm in scripts.gcn.train_loop.
    train_loop(
        model,
        criterion,
        optimizer,
        train_loader,
        test_loader,
        epochs=1000,
        verbose=False,
        min_loss=0.1,
    )

    model.eval()
    acc, loss = test(test_loader, model, criterion)
    accuracies.append(acc)
    losses.append(loss)
    print(f'> Fold {counter} trained. Test accuracy: {acc:.3f}\tTest loss {loss:.3f}')

# print mean accuracy and loss
print('-'*20)
print('REPORT')
print(f'Mean accuracy: {np.mean(accuracies):.3f} ')
print(f'Mean loss: {np.mean(losses):.3f} ')
print('-'*20)