# Use Graph Convolution Networks to Identify Patient Disease Based on Symptomology

Dataset: https://arxiv.org/abs/2205.09148 -> download release_conditions.json

Tutorials followed:

https://medium.com/cj-express-tech-tildi/first-timers-guide-to-pytorch-geometric-part-1-the-basic-1b6006e1f4db 

https://towardsdatascience.com/a-beginners-guide-to-graph-neural-networks-using-pytorch-geometric-part-1-d98dc93e7742 

https://towardsdatascience.com/a-beginners-guide-to-graph-neural-networks-using-pytorch-geometric-part-2-cd82c01330ab

In [51]:
# basic imports
import numpy as np
import pickle
import json
from pathlib import Path
import pandas as pd
from copy import deepcopy
import os.path as osp

# torch imports
import torch
import torch.nn.functional as F
import torch.nn as nn

# torch geometric imports
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import add_self_loops, degree, from_networkx, to_networkx
from torch_geometric.data import InMemoryDataset, download_url, Data
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv

# sklearn imports
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# graph-specific imports
import networkx as nx
from GraphEmbedding.ge import DeepWalk


Using GCNConv from torch.nn 

** note - removed dropout layer between nonlinear activation layer (F.relu) and second convolutional layer (self.conv2)
The dropout layer is usually implemented in dense convolution networks when it is computationally expensive to fit all possible neural networks to a given dataset. This dataset is smaller and does not need dropout (performance drops as network thins)


In [52]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.num_features, 16)
        self.conv2 = GCNConv(16, int(data.num_classes))

    def forward(self):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [53]:
# Load the initial dataset of disease and phenotypic data

f = open('/gstore/scratch/u/hysii/nlp/original_dataset')
json_data = json.load(f)


### Only focusing on diseases and symptoms, some symptoms repeat across multiple diseases -> focus on unique symptomology that distinguishes each disease 

first, extract all symptoms

In [54]:
all_sympts = []

for key, other_info in json_data.items():
    for s in other_info['symptoms'].keys():
        all_sympts.append(s)


next, determine which symptoms never occur for more than one disease/condition

In [55]:
unique_sympts = []
for i, sympt in enumerate(all_sympts):
    counter = 0
    
    for idx, s in enumerate(all_sympts):
        if sympt == s and i != idx:
            counter += 1
    
    if counter == 0:
        unique_sympts.append(sympt)

create a dictionary that maps each disease/condition to their unique symptomology

In [56]:
path_to_symptoms = {}

for key, other_info in json_data.items():
    symp_lst = []
    for s in unique_sympts:
        if s in other_info['symptoms'].keys():
            print(s)
            symp_lst.append(s)
    
    if len(symp_lst) != 0:
        path_to_symptoms[key] = symp_lst

larmes
vo_violent
ww_bouger
pyrosis
ww_bouffe
pale
rectorragie
ménorr
ballon_abdo
ww_valsalva
obstipation
dysarthrie
diplopie
fatigabilité_msk
claud_mâchoire
posttus_emesis
insp_siffla
contact_allergie
bw_bending
faiblesse faciale
footnumb
paralysie_visage
toux_Aboy
pls_irreg
prurit_occ
prurit_nasal
rds_anorexie
flushing
dysp_effort
laryngospasme
spasmes_msk
trismus
spasme_trapeze
protu_langue
regard_dévié
gain_poids
ulcères_bouche
angor_accelere
rds_sg
confusion
etouff
psy_depers
impression_mort
boire_ped
apnee
erytheme_occ
convulsion
pertes_vag
selles_pale


create a new dataset to rely on with only the unique symptoms

In [11]:
with open('/gstore/scratch/u/hysii/nlp/unique_symptoms.json', 'w') as f:
    json.dump(path_to_symptoms, f)

In [57]:
with open('/gstore/scratch/u/hysii/nlp/unique_symptoms.json', 'r') as f:
    data = json.load(f)

#### Using networkx's builtin graph functionality, create a graph from our updated json file and store it in a pickle file

In [58]:
G = nx.Graph(data)

with open('/gstore/scratch/u/hysii/nlp/graph.pckl', 'wb') as f:
    graph = pickle.dump(G, f)

#### Using DeepWalk, generate node embeddings that are more informed based on vertex representations in the latent space

In [59]:
# train the model and generate embeddings
model = DeepWalk(G, walk_length=5, num_walks=15, workers=1)
model.train(window_size=5,iter=3)

embeddings = model.get_embeddings()
embeddings = np.stack(embeddings.values())

Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  if (await self.run_code(code, result,  async_=asy)):


To identify diseases based on symptoms, generate an array of labels where each disease corresponds to a label

list of nodes = ['Flu', 'cough', 'sore throat', 'Pink eye', 'redness', 'itchiness']

(Flu, cough, sore throat) -> label 0

(pink eye, redness, itchiness) -> label 1

list of labels = [0, 0, 0, 1, 1, 1]

In [60]:
labels = []

for idx, (key, u_sympts) in enumerate(path_to_symptoms.items()):
    labels.append(idx)
    for s in u_sympts:
        labels.append(idx)

labels = np.array(labels)

## Generate a Functional Custom Dataset Using Previous Embeddings and Labels 

In [77]:
class MyDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(MyDataset, self).__init__(root, transform, None, None)

        # turn networkx graph into a torch.data object 
        data = from_networkx(G)

        data.num_nodes = G.number_of_nodes()
        
        # embedding 
        data.x = torch.from_numpy(embeddings).type(torch.float32)
        
        # labels
        y = torch.from_numpy(labels).type(torch.long)
        data.y = y.clone().detach()
        
        # number of classes/labels = number of diseases
        data.num_classes = len(path_to_symptoms.keys())

        # splitting the data into train and test
        X_train, X_test, y_train, y_test = train_test_split(pd.Series(list(G.nodes)), 
                                                            pd.Series(labels),
                                                            test_size=0.30, 
                                                            random_state=42)
        
        n_nodes = G.number_of_nodes()
        # create train and test masks for data
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[X_train.index] = True
        test_mask[X_test.index] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask

        self.data, self.slices = self.collate([data])
        
    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)

In [78]:
dataset = MyDataset(Path('/gstore/scratch/u/hysii/nlp/graph.pckl'))
data = dataset[0]

#### Set The Model To Train On To The Graph Convolution Network

In [79]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data =  data.to(device)
model = Net().to(device) 

Implement training and apply to test set to output accuracy scores

In [76]:
torch.manual_seed(42)

optimizer_name = "Adam"
lr = 1e-1
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
epochs = 200

def train():
  model.train()
  optimizer.zero_grad()
  F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
  optimizer.step()

@torch.no_grad()
def test():
  model.eval()
  logits = model()
  mask1 = data['train_mask']
  pred1 = logits[mask1].max(1)[1]
  acc1 = pred1.eq(data.y[mask1]).sum().item() / mask1.sum().item()
  mask = data['test_mask']
  pred = logits[mask].max(1)[1]
  acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
  return acc1,acc

for epoch in range(1, epochs):
  train()

train_acc,test_acc = test()

print('#' * 70)
print('Train Accuracy: %s' %train_acc )
print('Test Accuracy: %s' % test_acc)
print('#' * 70)

######################################################################
Train Accuracy: 0.8918918918918919
Test Accuracy: 0.23684210526315788
######################################################################


Future direction:

Generate a dataset that incorporates genetic variants that may contribute to each disease

Implement GraphSAGE and GAT to compare against