In [None]:
import os, sys, datetime, pickle
import networks.randomize_network as rn
from create_datasets import create_nx_datasets
import networkx as nx

**1. Create random networks**

Create 100 random networks for each randomization method ("Shuffled" and "Rewired") and for each gene set (AD and NDD).

In [None]:
# Gene sets to randomize; each one has its own input edge list under data/.
diseases = ['AD', 'ND']

# One output folder per randomization method. They are created up front so
# that nx.write_edgelist does not fail when the folders are missing.
shuffled_dir = 'data/random_networks/shuffled'
rewired_dir = 'data/random_networks/rewired'
for random_dir in (shuffled_dir, rewired_dir):
    os.makedirs(random_dir, exist_ok=True)

# NOTE(review): no random seed is set in this cell, so every execution
# presumably yields different random networks — confirm how `rn` draws
# its randomness if reproducibility matters.
for disease in diseases:
    infile = f'data/{disease}_STRING_PPI_edgelist_biggest.txt'
    original = rn.load_graph(infile)

    # 100 random networks per method, numbered 1..100 in the file names.
    for i in range(1, 101):
        shuffled = rn.shuffle_nodes(original)
        nx.write_edgelist(shuffled, f'{shuffled_dir}/{disease}_PPI_rand{i}_edgelist.txt')

        # generate_RDPN receives the edge-list path, not the loaded graph.
        rewired = rn.generate_RDPN(infile)
        # The rewired file must contain the rewired graph, not the shuffled one.
        nx.write_edgelist(rewired, f'{rewired_dir}/{disease}_PPI_rand{i}_edgelist.txt')


**2. Create graph datasets**

Create the corresponding graph datasets for each random network. This takes time because the code builds 800 different datasets (2 randomisation methods x 2 gene sets x 100 random networks x 2 different targets).

In [None]:
# Build and pickle one graph dataset per (target, gene set, randomization
# method, random-network index) combination: 2 x 2 x 2 x 100 = 800 files.
dataset = 'ADNI'
targets = ['PET', 'PETandDX']
diseases = ['AD', 'ND']
networks = ['shuffled', 'rewired']

for target in targets:
    for disease in diseases:
        for network in networks:

            # The output folder only depends on target and network; make sure
            # it exists so that opening the pickle file below cannot fail.
            outdir = f'data/graph_datasets/{target}/{network}'
            os.makedirs(outdir, exist_ok=True)

            for i in range(1, 101):

                start_time = datetime.datetime.now()
                print()

                # `i` selects which of the 100 random networks is used.
                result_nodes = create_nx_datasets.main('data', dataset, target, disease, network, 'missense', i)
                print('Coding: number of missense variants per node')

                outfile = f'{outdir}/{disease}_PPI_rand{i}_missense.pkl'
                with open(outfile, 'wb') as f:
                    pickle.dump(result_nodes, f)

                # Report the location only once the file has been written.
                print('Resulting dataset saved at:', outfile)
                print()

                result_nodes_time = datetime.datetime.now()
                print('Processing time:', result_nodes_time - start_time)
                print('\n\n')

**3. Graph classification with GNNs**

We then evaluated and tested different GNNs in the framework called [GraphGym](https://github.com/snap-stanford/GraphGym) (You *et al.*, 2020).

Configuration and grid files employed are in the subdirectory [graphgym_files](graphgym_files). The models' configuration in this case was the same as for the original GNNs.

Summarized results obtained by GraphGym and other models are in [results/GNN_comparison](results/GNN_comparison)

**4. Statistical analysis original performance *vs.* random performances**
We computed p-values with a 1-sample t-test comparing each original run against all the random runs (100 random datasets x 3 runs = 300 runs) of each randomization method, for each target and gene set.

For this, we first extracted the performance values of each run. Because of the huge size (GB) of all the files produced by GraphGym, we summarized all runs for each randomization method and classification task in a single file.

- Results for PET target with datasets using [Shuffled and Rewired methods](results/GNN_random_results/PET/)
- Results for PET&DX target with datasets using [Shuffled and Rewired methods](results/GNN_random_results/PETandDX/)

In [None]:
from scipy import stats
import pandas as pd

In [None]:
# Performance values of the original (non-randomized) runs, keyed by
# '<gene set>_<target>'; each list holds one value per original run.
# NOTE(review): the metric these numbers represent is not stated here — confirm.
results_dict = dict(
    AD_PET=[0.6898, 0.7180, 0.7294],
    ND_PET=[0.7050, 0.6349, 0.7143],
    AD_PETandDX=[0.6825, 0.7302, 0.7143],
    ND_PETandDX=[0.7937, 0.8532, 0.6389],
)

In [None]:
def compute_pvalues(original_runs, infile):
    """Compare each original run against the pooled random runs.

    The CSV at `infile` is read with its 'random' column as index and its
    'epoch' column dropped; every remaining cell is treated as the value of
    one random run, and all of them are pooled into a single sample.

    For each value in `original_runs`, a one-sided 1-sample t-test is run
    with the alternative hypothesis that the mean of the pooled random runs
    is less than that original value.

    Parameters
    ----------
    original_runs : sequence of float
        Performance value of each original run.
    infile : str or file-like
        CSV with the random runs (anything `pandas.read_csv` accepts).

    Returns
    -------
    tuple of float
        One p-value per entry of `original_runs`, in the same order.
    """
    # Build a new frame instead of mutating in place, so the cell can be
    # re-run without side effects on a shared object.
    random_results = pd.read_csv(infile, index_col='random').drop(columns=['epoch'])

    # Flatten the table row by row into one list of run values.
    all_runs = [value for row in random_results.values.tolist() for value in row]
    # Sanity check on the pooled sample size.
    print(len(all_runs))

    return tuple(
        stats.ttest_1samp(all_runs, original, alternative='less').pvalue
        for original in original_runs
    )

In [None]:
# Report, for every target / randomization method / gene set, the p-value of
# each original run against the pooled random runs.
targets = ['PET', 'PETandDX']
randoms = ['Shuffled', 'Rewired']
diseases = ['AD', 'ND']

# One label per original run, in the order the p-values are returned.
run_labels = (
    'Original run #1 vs. 300 random runs:',
    'Original run #2 vs. 300 random runs:',
    'Original run #3 vs. 300 random runs:',
)

for target in targets:
    for method in randoms:
        print(f'{target} target - {method} graph datasets')
        for disease in diseases:
            original_values = results_dict[f'{disease}_{target}']
            # NOTE(review): the notebook text links to results/GNN_random_results/
            # while this path reads from results/GNN_models_results/ — confirm
            # which folder actually holds the summarized CSV files.
            results_csv = f'results/GNN_models_results/{target}/{target}_{disease}_{method}_results.csv'
            pval1, pval2, pval3 = compute_pvalues(original_values, results_csv)

            print(f'{disease} network')
            for label, pvalue in zip(run_labels, (pval1, pval2, pval3)):
                print(label, pvalue)
            print()