In [1]:
import pandas as pd
import json
from scipy.stats import ttest_ind
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [14]:
# Load the table of best configurations. Later cells filter it by the
# 'Target', 'Disease' and 'Network' columns and read the architecture settings
# from 'Pre-MLP', 'MP', 'Post-MLP', 'Connectivity' and 'Aggregation'.
# NOTE(review): the extraction cells below depend on this variable, so this
# cell has to be executed before them.
best_configs = pd.read_csv('best_configs.csv')
best_configs

Unnamed: 0,Target,Disease,Network,Pre-MLP,MP,Post-MLP,Connectivity,Aggregation,Best epoch,Mean,Std,bestepoch_file
0,PET,AD,PPI,1,2,2,skipsum,mean,179,0.7124,0.0166,/home/laura/GraphGym/run/results/PET_grid_miss...
1,PET,ND,PPI,1,2,2,skipconcat,mean,179,0.6738,0.0291,/home/laura/GraphGym/run/results/PET_grid_miss...
2,PETandDX,AD,PPI,2,2,2,skipsum,max,119,0.709,0.0198,/home/laura/GraphGym/run/results/PETandDX_grid...
3,PETandDX,ND,PPI,2,2,2,skipsum,add,159,0.7619,0.0903,/home/laura/GraphGym/run/results/PETandDX_grid...
4,PET,AD,biogrid,2,2,2,skipsum,max,99,0.6837,0.0264,/home/laura/GraphGym/run/results/PET_grid_miss...
5,PET,AD,huri,1,2,3,skipconcat,max,99,0.7003,0.0083,/home/laura/GraphGym/run/results/PET_grid_miss...
6,PET,AD,snap_brain,2,2,2,skipsum,max,159,0.7092,0.0065,/home/laura/GraphGym/run/results/PET_grid_miss...
7,PET,AD,giant_brain,2,2,2,skipconcat,max,39,0.6993,0.015,/home/laura/GraphGym/run/results/PET_grid_miss...
8,PETandDX,AD,biogrid,2,2,2,skipconcat,max,199,0.7037,0.0163,/home/laura/GraphGym/run/results/PETandDX_grid...
9,PETandDX,AD,huri,2,2,3,skipsum,add,59,0.7804,0.0487,/home/laura/GraphGym/run/results/PETandDX_grid...


### Extract random runs from each graph-classification task

In [3]:
def get_random_runs(df, directory, target, disease, network, cfg, n_random=100, n_runs=3):
    """Collect the test AUC of every repetition of every random network.

    For each random network ``i`` the best epoch is looked up in ``df`` and the
    AUC logged at that epoch is read from the ``stats.json`` file of each
    repetition.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``dataset`` and an ``epoch`` column. It must contain
        exactly one row per ``{disease}_PPI_rand{i}_missense`` dataset.
    directory : str
        Folder that contains one sub-folder per random-network experiment.
    target, disease : str
        Used to build the experiment folder and dataset names.
    network : str
        Currently unused: dataset and folder names hard-code ``PPI``.
    cfg : sequence
        ``[l_pre, l_mp, l_post, stage, agg]`` of the evaluated architecture.
    n_random : int, optional
        Number of random networks, numbered from 1 (default 100).
    n_runs : int, optional
        Number of repetitions per network, numbered from 1 (default 3).

    Returns
    -------
    pandas.DataFrame
        One row per random network with columns ``random``, ``epoch`` and one
        column per repetition (``'1'``, ``'2'``, ...).

    Raises
    ------
    ValueError
        If ``df`` does not hold exactly one row for a dataset, or a
        ``stats.json`` file does not hold exactly one row for the best epoch.
    """
    run_columns = [str(j) for j in range(1, n_runs + 1)]

    exp_results = []
    for i in range(1, n_random + 1):
        dataset = f'{disease}_PPI_rand{i}_missense'

        # Get the epoch that gives best values
        rand = df.loc[df['dataset'] == dataset]
        if len(rand) != 1:
            raise ValueError(
                f'expected exactly one row for dataset {dataset!r}, found {len(rand)}'
            )
        # Explicit scalar extraction: int() on a whole array is deprecated.
        epoch = int(rand['epoch'].iloc[0])

        run_results = [i, epoch]

        # Obtain the AUC of every repetition at that epoch
        for j in range(1, n_runs + 1):
            run_file = f'{directory}/{target}_random-format=custom_split-dataset={disease}_PPI_rand{i}_missense-task=graph-trans=False-feature=-label=-l_pre={cfg[0]}-l_mp={cfg[1]}-l_post={cfg[2]}-stage={cfg[3]}-agg={cfg[4]}/{j}/test/stats.json'
            run = pd.read_json(run_file, lines=True)
            best = run.loc[run['epoch'] == epoch]
            if len(best) != 1:
                raise ValueError(
                    f'expected exactly one row for epoch {epoch} in {run_file!r}, found {len(best)}'
                )
            # Explicit scalar extraction: float() on a Series is deprecated.
            run_results.append(float(best['auc'].iloc[0]))

        exp_results.append(run_results)

    result = pd.DataFrame(exp_results, columns=['random', 'epoch'] + run_columns)
    return result

### PET target, AD PPI random networks

In [4]:
# Task under study: these values select the row of `best_configs` and name
# the output files.
target = 'PET'
disease = 'AD'
network = 'PPI'

# Result folders of the two randomisation experiments. No trailing slash:
# get_random_runs inserts its own separator when building the run paths.
shuffled_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_SHUFFLED_100'
rewired_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_REWIRED_100_v1'

# Best epoch of every random network
shuffled = pd.read_csv(f'{shuffled_dir}/agg/test_bestepoch.csv')
rewired = pd.read_csv(f'{rewired_dir}/agg/test_bestepoch.csv')

# Get configuration parameters of the original one
task_rows = best_configs.loc[(best_configs['Target'] == target) & (best_configs['Disease'] == disease) & (best_configs['Network'] == network)]
cfg_rows = task_rows[['Pre-MLP', 'MP', 'Post-MLP', 'Connectivity', 'Aggregation']].values.tolist()
# Exactly one configuration must match; otherwise the run paths built from
# `cfg` would be wrong or incomplete.
if len(cfg_rows) != 1:
    raise ValueError(f'expected one best configuration for {target}/{disease}/{network}, found {len(cfg_rows)}')
cfg = cfg_rows[0]
print(cfg)

res_shuffled = get_random_runs(shuffled, shuffled_dir, target, disease, network, cfg)
res_rewired = get_random_runs(rewired, rewired_dir, target, disease, network, cfg)

# index=False: the 'random' column already identifies each row.
res_shuffled.to_csv(f'{target}_{disease}_{network}_SHUFFLED_results.csv', index=False)
res_rewired.to_csv(f'{target}_{disease}_{network}_REWIRED_results.csv', index=False)

[1, 2, 2, 'skipsum', 'mean']


### PET target, ND PPI random networks

In [5]:
# Task under study: these values select the row of `best_configs` and name
# the output files.
target = 'PET'
disease = 'ND'
network = 'PPI'

# Result folders of the two randomisation experiments. No trailing slash:
# get_random_runs inserts its own separator when building the run paths.
shuffled_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_SHUFFLED_100'
rewired_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_REWIRED_100_v1'

# Best epoch of every random network (this task reads the *_cfgok.csv files)
shuffled = pd.read_csv(f'{shuffled_dir}/agg/test_bestepoch_cfgok.csv')
rewired = pd.read_csv(f'{rewired_dir}/agg/test_bestepoch_cfgok.csv')

# Get configuration parameters of the original one
task_rows = best_configs.loc[(best_configs['Target'] == target) & (best_configs['Disease'] == disease) & (best_configs['Network'] == network)]
cfg_rows = task_rows[['Pre-MLP', 'MP', 'Post-MLP', 'Connectivity', 'Aggregation']].values.tolist()
# Exactly one configuration must match; otherwise the run paths built from
# `cfg` would be wrong or incomplete.
if len(cfg_rows) != 1:
    raise ValueError(f'expected one best configuration for {target}/{disease}/{network}, found {len(cfg_rows)}')
cfg = cfg_rows[0]
print(cfg)

res_shuffled = get_random_runs(shuffled, shuffled_dir, target, disease, network, cfg)
res_rewired = get_random_runs(rewired, rewired_dir, target, disease, network, cfg)

# index=False: the 'random' column already identifies each row.
res_shuffled.to_csv(f'{target}_{disease}_{network}_SHUFFLED_results.csv', index=False)
res_rewired.to_csv(f'{target}_{disease}_{network}_REWIRED_results.csv', index=False)

[1, 2, 2, 'skipconcat', 'mean']


### PET&DX target, AD PPI random networks

In [6]:
# Task under study: these values select the row of `best_configs` and name
# the output files.
target = 'PETandDX'
disease = 'AD'
network = 'PPI'

# Result folders of the two randomisation experiments. No trailing slash:
# get_random_runs inserts its own separator when building the run paths.
shuffled_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_SHUFFLED_100'
rewired_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_REWIRED_100_v1'

# Best epoch of every random network
shuffled = pd.read_csv(f'{shuffled_dir}/agg/test_bestepoch.csv')
rewired = pd.read_csv(f'{rewired_dir}/agg/test_bestepoch.csv')

# Get configuration parameters of the original one
task_rows = best_configs.loc[(best_configs['Target'] == target) & (best_configs['Disease'] == disease) & (best_configs['Network'] == network)]
cfg_rows = task_rows[['Pre-MLP', 'MP', 'Post-MLP', 'Connectivity', 'Aggregation']].values.tolist()
# Exactly one configuration must match; otherwise the run paths built from
# `cfg` would be wrong or incomplete.
if len(cfg_rows) != 1:
    raise ValueError(f'expected one best configuration for {target}/{disease}/{network}, found {len(cfg_rows)}')
cfg = cfg_rows[0]
print(cfg)

res_shuffled = get_random_runs(shuffled, shuffled_dir, target, disease, network, cfg)
res_rewired = get_random_runs(rewired, rewired_dir, target, disease, network, cfg)

# index=False: the 'random' column already identifies each row.
res_shuffled.to_csv(f'{target}_{disease}_{network}_SHUFFLED_results.csv', index=False)
res_rewired.to_csv(f'{target}_{disease}_{network}_REWIRED_results.csv', index=False)

[2, 2, 2, 'skipsum', 'max']


### PET&DX target, ND PPI random networks

In [7]:
# Task under study: these values select the row of `best_configs` and name
# the output files.
target = 'PETandDX'
disease = 'ND'
network = 'PPI'

# Result folders of the two randomisation experiments. No trailing slash:
# get_random_runs inserts its own separator when building the run paths.
shuffled_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_SHUFFLED_100'
rewired_dir = f'/home/laura/GraphGym/run/results/{target}_{disease}_PPI_REWIRED_100_v1'

# Best epoch of every random network
shuffled = pd.read_csv(f'{shuffled_dir}/agg/test_bestepoch.csv')
rewired = pd.read_csv(f'{rewired_dir}/agg/test_bestepoch.csv')

# Get configuration parameters of the original one
task_rows = best_configs.loc[(best_configs['Target'] == target) & (best_configs['Disease'] == disease) & (best_configs['Network'] == network)]
cfg_rows = task_rows[['Pre-MLP', 'MP', 'Post-MLP', 'Connectivity', 'Aggregation']].values.tolist()
# Exactly one configuration must match; otherwise the run paths built from
# `cfg` would be wrong or incomplete.
if len(cfg_rows) != 1:
    raise ValueError(f'expected one best configuration for {target}/{disease}/{network}, found {len(cfg_rows)}')
cfg = cfg_rows[0]
print(cfg)

res_shuffled = get_random_runs(shuffled, shuffled_dir, target, disease, network, cfg)
res_rewired = get_random_runs(rewired, rewired_dir, target, disease, network, cfg)

# index=False: the 'random' column already identifies each row.
res_shuffled.to_csv(f'{target}_{disease}_{network}_SHUFFLED_results.csv', index=False)
res_rewired.to_csv(f'{target}_{disease}_{network}_REWIRED_results.csv', index=False)

[2, 2, 2, 'skipsum', 'add']


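## Statistical analysis
For each classification task, run a one-sided 1-sample t-test (`alternative='less'`) that compares the pooled AUCs of the 300 random runs (100 random networks x 3 runs) against the AUC of each original run. A small p-value means that the random networks score, on average, significantly lower than that original run.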

In [2]:
# AUC values of each of the 3 runs
# One list per classification task; each list is passed to compute_pvalues
# as `original_runs` in the cells below.
ad_ppi_pet = [0.6898, 0.7180, 0.7294]
# NOTE(review): these three values average 0.6847, whereas the PET/ND/PPI row
# of best_configs shows Mean 0.6738 (Std 0.0291); the other three lists match
# the Mean of their rows. The last entry also equals ad_ppi_petdx[2], so it
# may be a copy-paste slip -- confirm against the original run logs.
nd_ppi_pet = [0.7050, 0.6349, 0.7143]
ad_ppi_petdx = [0.6825, 0.7302, 0.7143]
nd_ppi_petdx = [0.7937, 0.8532, 0.6389]

In [8]:
def compute_pvalues(original_runs, data):
    """One-sided 1-sample t-tests of the random-run AUCs against each original run.

    All values of ``data`` except the ``epoch`` column are pooled into a single
    sample. For each of the first three entries of ``original_runs`` the pooled
    sample is tested with ``alternative='less'``, i.e. whether its mean is
    lower than that original AUC.

    Parameters
    ----------
    original_runs : sequence of float
        AUCs of the original runs; entries 0, 1 and 2 are used.
    data : pandas.DataFrame
        Results of the random runs, with an ``epoch`` column plus the AUC
        columns. The frame is not modified.

    Returns
    -------
    tuple of float
        ``(pval1, pval2, pval3)``, one p-value per original run.

    Notes
    -----
    Prints the size of the pooled sample.
    """
    # Work on a copy without 'epoch' so the caller's frame stays intact and
    # the function can be called again on the same data.
    auc_table = data.drop(columns=['epoch'])

    # Pool every remaining value into one flat sample (row by row).
    all_runs = auc_table.to_numpy().ravel().tolist()
    print(len(all_runs))

    def one_sided_pvalue(reference):
        # H1: mean of the random runs is lower than the reference AUC
        return stats.ttest_1samp(all_runs, reference, alternative='less')[1]

    pval1 = one_sided_pvalue(original_runs[0])
    pval2 = one_sided_pvalue(original_runs[1])
    pval3 = one_sided_pvalue(original_runs[2])

    return pval1, pval2, pval3

### PET target, AD PPI random networks

In [9]:
# Task whose random-network results are tested
target = 'PET'
disease = 'AD'
network = 'PPI'

# Result tables written by the extraction step
infile_shuffled = f'{target}_{disease}_{network}_SHUFFLED_results.csv'
infile_rewired = f'{target}_{disease}_{network}_REWIRED_results.csv'

shuffled_data = pd.read_csv(infile_shuffled, index_col='random')
rewired_data = pd.read_csv(infile_rewired, index_col='random')

# One p-value per original run, for each kind of random network
pval1_shuffled, pval2_shuffled, pval3_shuffled = compute_pvalues(ad_ppi_pet, shuffled_data)
pval1_rewired, pval2_rewired, pval3_rewired = compute_pvalues(ad_ppi_pet, rewired_data)

# Report: a blank line, the group title, then the three p-values
for label, pvals in (
    ('SHUFFLED p-values', (pval1_shuffled, pval2_shuffled, pval3_shuffled)),
    ('REWIRED p-values', (pval1_rewired, pval2_rewired, pval3_rewired)),
):
    print()
    print(label)
    print(*pvals, sep='\n')

300
300

SHUFFLED p-values
8.329337506252698e-32
1.328803149681073e-68
1.1458347914587411e-82

REWIRED p-values
1.6815600981318664e-20
8.319559996451436e-74
8.200674098467878e-94


### PET target, ND PPI random networks

In [12]:
# Task whose random-network results are tested
target = 'PET'
disease = 'ND'
network = 'PPI'

# Result tables written by the extraction step
infile_shuffled = f'{target}_{disease}_{network}_SHUFFLED_results.csv'
infile_rewired = f'{target}_{disease}_{network}_REWIRED_results.csv'

shuffled_data = pd.read_csv(infile_shuffled, index_col='random')
rewired_data = pd.read_csv(infile_rewired, index_col='random')

# One p-value per original run, for each kind of random network
pval1_shuffled, pval2_shuffled, pval3_shuffled = compute_pvalues(nd_ppi_pet, shuffled_data)
pval1_rewired, pval2_rewired, pval3_rewired = compute_pvalues(nd_ppi_pet, rewired_data)

# Report: a blank line, the group title, then the three p-values
for label, pvals in (
    ('SHUFFLED p-values', (pval1_shuffled, pval2_shuffled, pval3_shuffled)),
    ('REWIRED p-values', (pval1_rewired, pval2_rewired, pval3_rewired)),
):
    print()
    print(label)
    print(*pvals, sep='\n')

300
300

SHUFFLED p-values
2.610391236895768e-106
6.094880669927345e-37
4.471070614893043e-114

REWIRED p-values
3.382962662926563e-62
0.4388383292737576
2.282646260315474e-72


### PET&DX target, AD PPI random networks

In [11]:
# Task whose random-network results are tested
target = 'PETandDX'
disease = 'AD'
network = 'PPI'

# Result tables written by the extraction step
infile_shuffled = f'{target}_{disease}_{network}_SHUFFLED_results.csv'
infile_rewired = f'{target}_{disease}_{network}_REWIRED_results.csv'

shuffled_data = pd.read_csv(infile_shuffled, index_col='random')
rewired_data = pd.read_csv(infile_rewired, index_col='random')

# One p-value per original run, for each kind of random network
pval1_shuffled, pval2_shuffled, pval3_shuffled = compute_pvalues(ad_ppi_petdx, shuffled_data)
pval1_rewired, pval2_rewired, pval3_rewired = compute_pvalues(ad_ppi_petdx, rewired_data)

# Report: a blank line, the group title, then the three p-values
for label, pvals in (
    ('SHUFFLED p-values', (pval1_shuffled, pval2_shuffled, pval3_shuffled)),
    ('REWIRED p-values', (pval1_rewired, pval2_rewired, pval3_rewired)),
):
    print()
    print(label)
    print(*pvals, sep='\n')

300
300

SHUFFLED p-values
0.9999999999999203
0.060056008409483747
0.9375864082038932

REWIRED p-values
1.0
0.0721644534952392
0.982547375311063


### PET&DX target, ND PPI random networks

In [10]:
# Task whose random-network results are tested
target = 'PETandDX'
disease = 'ND'
network = 'PPI'

# Result tables written by the extraction step
infile_shuffled = f'{target}_{disease}_{network}_SHUFFLED_results.csv'
infile_rewired = f'{target}_{disease}_{network}_REWIRED_results.csv'

shuffled_data = pd.read_csv(infile_shuffled, index_col='random')
rewired_data = pd.read_csv(infile_rewired, index_col='random')

# One p-value per original run, for each kind of random network
pval1_shuffled, pval2_shuffled, pval3_shuffled = compute_pvalues(nd_ppi_petdx, shuffled_data)
pval1_rewired, pval2_rewired, pval3_rewired = compute_pvalues(nd_ppi_petdx, rewired_data)

# Report: a blank line, the group title, then the three p-values
for label, pvals in (
    ('SHUFFLED p-values', (pval1_shuffled, pval2_shuffled, pval3_shuffled)),
    ('REWIRED p-values', (pval1_rewired, pval2_rewired, pval3_rewired)),
):
    print()
    print(label)
    print(*pvals, sep='\n')

300
300

SHUFFLED p-values
1.0536663884788722e-85
3.370804425067958e-120
0.9111582432928029

REWIRED p-values
8.58652809150937e-28
1.2147703520584758e-76
1.0
