In [2]:
#Load libraries 
import pandas as pd
import sspa
import scipy
import numpy as np 
import os #get path location
import pickle

We build two separate networks: one for the mild COVID cases (WHO 1-2) and one for the severe COVID cases (WHO 3-7). <br>
The WHO 0 group is left out because only 18 of its samples are common to the metabolomic and proteomic datasets.

0       Common samples: 18           Metabolomic samples: 133        Proteomic samples: 123 <br>
1-2       Common samples: 45          Metabolomic samples: 45        Proteomic samples: 48 <br>
3-4       Common samples: 56          Metabolomic samples: 57        Proteomic samples: 59 <br>
5-7       Common samples: 27          Metabolomic samples: 28        Proteomic samples: 28 <br>

146 common samples overall,   128 cases (45 samples (WHO 1-2) vs 83 samples (WHO 3-7))

In [2]:
#Load dataset
df = pd.read_csv('Data/Su_COVID_metabolomics_processed.csv', index_col=0)
# Remove the literal "-BL" suffix from the sample IDs. str.rstrip('-BL') would
# instead strip ANY trailing run of the characters '-', 'B' and 'L', which can
# eat into IDs that happen to end in one of those characters.
df.index = df.index.str.replace(r'-BL$', '', regex=True)

df2 = pd.read_csv('Data/Su_COVID_proteomics_processed.csv', index_col=0)
intersection = list(set(df.index.tolist()) & set(df2.index.tolist())) #set removes duplicates
# Keep only the sample IDs that start with "INCOV"
intersection = [sample for sample in intersection if sample.startswith("INCOV")]
print(len(intersection))

# Restrict the metabolomic data to the samples shared with the proteomic data
df = df[df.index.isin(intersection)]

# Split by WHO status; .iloc[:,:-2] drops the last two columns (the metadata)
df_mild = (df[df["WHO_status"] == '1-2']).iloc[:,:-2] #45 samples, remove the metadata
df_severe = (df[df["WHO_status"].isin(['3-4', '5-7'])]).iloc[:,:-2] #83 samples

128


In [3]:
#Load the Reactome pathway definitions from the local GMT file
reactome_pathways = sspa.process_gmt("Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

#Load the root pathways (the sheet has no header row: column 0 holds the keys, column 1 the values)
root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
root_pathway_dict = dict(zip(root_path[0].to_numpy(), root_path[1].to_numpy()))
root_pathway_names = [pathway_key for pathway_key in root_pathway_dict]



### Step 1: Determine initial test-statistic

In [4]:
#Squared Spearman correlation between pathway scores

def squared_spearman_corr(data):
    """Compute the squared Spearman correlation matrix of the kPCA pathway scores.

    The pathway scores are obtained from ``sspa.sspa_kpca`` using the
    module-level ``reactome_pathways``; columns listed in the module-level
    ``root_pathway_names`` are removed before correlating.

    Parameters
    ----------
    data : input passed straight to ``sspa.sspa_kpca``.

    Returns
    -------
    tuple
        ``(squared_coefficients, pathway_columns)`` where the first item is the
        element-wise square of the Spearman correlation coefficients and the
        second is the list of score columns that were correlated.
    """
    pathway_scores = sspa.sspa_kpca(data, reactome_pathways)

    # Drop the root pathways that are present among the score columns (approach taken from Sara's code)
    root_columns = list(set(root_pathway_names) & set(pathway_scores.columns))
    pathway_scores = pathway_scores.drop(columns=root_columns)

    # spearmanr returns (coefficients, p-values); only the coefficients are needed
    rho, _p_values = scipy.stats.spearmanr(pathway_scores)

    return np.square(rho), list(pathway_scores.columns)




#Function to calculate the absolute difference between two matrices and list it once per edge

def absolute_val(data1,data2,edgelist):
    """Return the absolute element-wise difference of two square matrices, one row per edge.

    Only the strictly lower triangle is reported, so self-comparisons (the
    diagonal) and the mirrored duplicate of every pair are left out. Rows are
    ordered row by row through the lower triangle.

    Parameters
    ----------
    data1, data2 : array-like of shape (n, n)
        Matrices to compare (here: squared correlation matrices).
    edgelist : sequence of length n
        Labels for the rows/columns of both matrices.

    Returns
    -------
    pandas.DataFrame
        Single column ``Initial_tstat`` indexed by ``Edges``, where each label
        is ``"<row label>, <column label>"``. Pairs whose difference is NaN are
        omitted.

    Raises
    ------
    ValueError
        If the difference matrix is not of shape ``(len(edgelist), len(edgelist))``.
    """
    abs_rho_squared = np.absolute(np.array(data1) - np.array(data2))

    n_nodes = len(edgelist)
    if abs_rho_squared.shape != (n_nodes, n_nodes):
        raise ValueError(
            "Expected matrices of shape ({0}, {0}) to match edgelist, got {1}".format(
                n_nodes, abs_rho_squared.shape
            )
        )

    # Index the strictly lower triangle directly (k=-1 excludes the diagonal).
    # This avoids masking with NaN and relying on DataFrame.stack() to drop the
    # masked cells, which the newer pandas stack implementation no longer does.
    rows, cols = np.tril_indices(n_nodes, k=-1)
    values = abs_rho_squared[rows, cols]
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(float)

    # Pairs with an undefined (NaN) difference are left out of the result
    keep = ~np.isnan(values)
    labels = [
        "{}, {}".format(edgelist[row], edgelist[col])
        for row, col in zip(rows[keep], cols[keep])
    ]

    return pd.DataFrame(
        {"Initial_tstat": values[keep]},
        index=pd.Index(labels, name="Edges"),
    )

Note: For the delta squared correlation values for the unshuffled data (i.e. the real data) I keep the indices (pathway edges). Since I already have a record of the edges, there is no need to keep the edges for each permutation, since the order is the same each time. 

In [5]:
spearman_mild,edgelist_mild = squared_spearman_corr(df_mild)
spearman_severe,edgelist = squared_spearman_corr(df_severe)

# The two matrices are subtracted element-wise and labelled with a single edge
# list, so both groups must yield the same pathways in the same order.
if edgelist_mild != edgelist:
    raise ValueError("Mild and severe pathway score columns differ; the correlation matrices cannot be compared edge by edge.")

# Per-edge absolute difference between the mild and severe squared correlations
output = absolute_val(spearman_mild,spearman_severe,edgelist)

Note: No individual Spearman correlation value is exactly zero. An initial test statistic of zero therefore means that the squared correlation is identical in both groups (for example, both equal to 1), so their difference is zero.

In [13]:
# Label the mild-group squared correlation matrix with the pathway IDs and preview its top-left 25x25 corner
test_df = pd.DataFrame(data=spearman_mild, index=edgelist, columns=edgelist)
display(test_df.iloc[0:25, 0:25])

Unnamed: 0,R-HSA-110331,R-HSA-112310,R-HSA-112311,R-HSA-112315,R-HSA-1237112,R-HSA-1368082,R-HSA-1368108,R-HSA-1428517,R-HSA-1483148,R-HSA-1483206,...,R-HSA-15869,R-HSA-1592230,R-HSA-159418,R-HSA-1614603,R-HSA-1614635,R-HSA-163685,R-HSA-1655829,R-HSA-1660661,R-HSA-168249,R-HSA-189200
R-HSA-110331,1.0,0.007222,0.006043,0.020738,0.378411,0.021004,0.021004,0.0749562,0.0005624209,0.001155,...,0.78529,0.021004,0.056305,0.013843,0.049168,0.019726,0.021004,0.037154,0.001722,0.029562
R-HSA-112310,0.007222,1.0,0.063791,0.809052,0.018884,0.715462,0.715462,7.655173e-06,0.3585785,0.35716,...,0.009974,0.715462,0.022441,0.119794,0.000202,0.516732,0.715462,0.016333,0.006651,0.01524
R-HSA-112311,0.006043,0.063791,1.0,0.075607,0.061418,0.004016,0.004016,9.764252e-05,0.02299678,0.045219,...,0.04219,0.004016,0.006673,0.066617,0.215448,0.063592,0.004016,0.017848,0.003225,0.002736
R-HSA-112315,0.020738,0.809052,0.075607,1.0,0.000157,0.522811,0.522811,0.002078111,0.2709758,0.288252,...,0.019504,0.522811,0.005463,0.086943,3.7e-05,0.333776,0.522811,0.016031,0.001744,0.022048
R-HSA-1237112,0.378411,0.018884,0.061418,0.000157,1.0,0.00023,0.00023,0.01855913,0.01488464,0.026006,...,0.214837,0.00023,0.018416,0.000202,0.239571,0.139122,0.00023,0.002667,0.00315,0.017883
R-HSA-1368082,0.021004,0.715462,0.004016,0.522811,0.00023,1.0,1.0,0.001924886,0.2767675,0.269332,...,0.03958,1.0,0.062467,0.060637,0.003999,0.371152,1.0,0.00434,0.000674,0.000177
R-HSA-1368108,0.021004,0.715462,0.004016,0.522811,0.00023,1.0,1.0,0.001924886,0.2767675,0.269332,...,0.03958,1.0,0.062467,0.060637,0.003999,0.371152,1.0,0.00434,0.000674,0.000177
R-HSA-1428517,0.074956,8e-06,9.8e-05,0.002078,0.018559,0.001925,0.001925,1.0,6.249121e-07,0.000934,...,0.095293,0.001925,0.014981,0.270016,0.01524,0.002389,0.001925,0.020813,0.007022,0.008751
R-HSA-1483148,0.000562,0.358578,0.022997,0.270976,0.014885,0.276767,0.276767,6.249121e-07,1.0,0.86669,...,0.000744,0.276767,0.000108,0.0056,0.026177,0.123841,0.276767,4.5e-05,0.007088,0.01067
R-HSA-1483206,0.001155,0.35716,0.045219,0.288252,0.026006,0.269332,0.269332,0.0009343131,0.8666899,1.0,...,0.01,0.269332,0.003106,0.009074,0.039109,0.152707,0.269332,0.010642,7.1e-05,0.003721


In [11]:
# Show the per-edge initial test statistics returned by absolute_val
output

Unnamed: 0_level_0,Initial_tstat
Edges,Unnamed: 1_level_1
R-HSA-112310/R-HSA-110331,0.029745
R-HSA-112311/R-HSA-110331,0.102846
R-HSA-112311/R-HSA-112310,0.330878
R-HSA-112315/R-HSA-110331,0.033879
R-HSA-112315/R-HSA-112310,0.115238
...,...
R-HSA-975634/R-HSA-9717207,0.040248
R-HSA-975634/R-HSA-9734207,0.008203
R-HSA-975634/R-HSA-9735804,0.002968
R-HSA-975634/R-HSA-9749641,0.008119


In [None]:
# Save the initial test statistics to CSV (presumably read back by the permutation scripts - confirm)
output.to_csv("Data/permutation_test_files_metabolomics/initial_tstats.csv")

### Step 2: Shuffle the labels

The sample labels are shuffled, rather than reassigning the samples to two equally sized groups, because the 1-2 class and the 3-7 class differ in size. See HPC_permutation_script_metabolomic for more info.

### Step 3: Read in the permutation files 

Using the HPC, 10 files each store 10k permutations. 10 array jobs are carried out to read in all 10k values, make them absolute and count how many are above the initial test statistic. See permutation_distribution.ipynb for more info.

https://www.jwilber.me/permutationtest/# <br>
The initial test statistic is the absolute difference between the two groups. Each permutation yields a new test statistic, computed in the same way as the ABSOLUTE difference between the two shuffled groups. Using the absolute rather than the signed difference does not change the result: the p-value is two-tailed either way, and with the absolute difference the negative values are simply mapped onto the positive side.

### Step 4: Compare the difference in edges with other networks

Comparing with the naive networks (mild vs severe) to see if they agree with the differential network results:

In [4]:
import networkx as nx

In [5]:
# Load the mild-case and severe-case networks from their GML files
mild = nx.read_gml("Cytoscape/metabolomic_final_mildcases.gml")
severe = nx.read_gml("Cytoscape/metabolomic_final_severecases.gml")

In [14]:
# Materialise the edge lists of both networks, then report their sizes (mild first, severe second)
mild_edges, severe_edges = list(mild.edges()), list(severe.edges())
for network_edges in (mild_edges, severe_edges):
    print(len(network_edges))

144
162


In [16]:
#Example code from https://stackoverflow.com/questions/41125909/python-find-elements-in-one-list-that-are-not-in-the-other
list_1 = ["a", "b", "c", "d", "e"]
list_2 = ["a", "f", "c", "m"]
# Items of list_2 that do not occur in list_1
set(list_2).difference(list_1)

#set(['m', 'f'])

{'f', 'm'}

In [12]:
#Edges present in the mild network but not the severe network
# Compare edges as sorted node pairs so that (u, v) and (v, u) count as the same
# edge (assumes the networks are undirected - confirm).
severe_pairs = {tuple(sorted(edge)) for edge in severe_edges}
len({tuple(sorted(edge)) for edge in mild_edges} - severe_pairs)

98

In [13]:
#Edges present in the severe network but not the mild network
# Compare edges as sorted node pairs so that (u, v) and (v, u) count as the same
# edge (assumes the networks are undirected - confirm).
mild_pairs = {tuple(sorted(edge)) for edge in mild_edges}
len({tuple(sorted(edge)) for edge in severe_edges} - mild_pairs)

116

In [1]:
#Reading in the edges expressed in the differential metabolomic network
with open('Data/permutation_test_files_metabolomics/sig_edges_beforefdr.txt') as f:
    lines = f.readlines()

# Only the first line is used: a comma-separated run of node IDs in which every
# two consecutive IDs form one edge.
edges = lines[0].split(",") if lines else []

# Strip surrounding whitespace from every ID. This removes the space that
# follows the comma inside an edge label and also the trailing newline kept by
# readlines(), which would otherwise end up in the last node ID.
node_ids = [token.strip() for token in edges]
if node_ids and node_ids[-1] == "":
    node_ids.pop() # empty file/line or a trailing comma
if len(node_ids) % 2:
    raise ValueError("Expected an even number of node IDs in sig_edges_beforefdr.txt, got {}".format(len(node_ids)))

# Pair up consecutive IDs: (first, second), (third, fourth), ...
edges_remaining = list(zip(node_ids[::2], node_ids[1::2]))

print(len(edges_remaining))
edges_remaining[:10] # preview only; the full list is long

795


[('R-HSA-1368082', 'R-HSA-1237112'),
 ('R-HSA-1368108', 'R-HSA-1237112'),
 ('R-HSA-1483206', 'R-HSA-1368082'),
 ('R-HSA-1483206', 'R-HSA-1368108'),
 ('R-HSA-1483255', 'R-HSA-1483206'),
 ('R-HSA-1483257', 'R-HSA-112310'),
 ('R-HSA-1483257', 'R-HSA-1368082'),
 ('R-HSA-1483257', 'R-HSA-1368108'),
 ('R-HSA-1483257', 'R-HSA-1483148'),
 ('R-HSA-156580', 'R-HSA-110331'),
 ('R-HSA-156582', 'R-HSA-1237112'),
 ('R-HSA-156582', 'R-HSA-1483206'),
 ('R-HSA-156582', 'R-HSA-1483257'),
 ('R-HSA-156584', 'R-HSA-156580'),
 ('R-HSA-15869', 'R-HSA-156584'),
 ('R-HSA-1592230', 'R-HSA-1237112'),
 ('R-HSA-1592230', 'R-HSA-1483206'),
 ('R-HSA-1592230', 'R-HSA-1483257'),
 ('R-HSA-1614603', 'R-HSA-1428517'),
 ('R-HSA-1655829', 'R-HSA-1237112'),
 ('R-HSA-1655829', 'R-HSA-1483206'),
 ('R-HSA-1655829', 'R-HSA-1483257'),
 ('R-HSA-168249', 'R-HSA-156582'),
 ('R-HSA-192456', 'R-HSA-1428517'),
 ('R-HSA-192456', 'R-HSA-1483206'),
 ('R-HSA-192456', 'R-HSA-1483257'),
 ('R-HSA-196071', 'R-HSA-110331'),
 ('R-HSA-196071', '

In [21]:
# Match edges irrespective of node order: an undirected edge may be stored as
# (u, v) in one list and as (v, u) in the other (assumes the networks are
# undirected - confirm).
mild_pairs = {tuple(sorted(edge)) for edge in mild_edges}
intersection = [edge for edge in set(edges_remaining) if tuple(sorted(edge)) in mild_pairs]
len(intersection) #differential edges also present in the mild network (recorded run with order-sensitive matching: 30 edges out of 98)

30

In [22]:
# Match edges irrespective of node order: an undirected edge may be stored as
# (u, v) in one list and as (v, u) in the other (assumes the networks are
# undirected - confirm).
severe_pairs = {tuple(sorted(edge)) for edge in severe_edges}
intersection = [edge for edge in set(edges_remaining) if tuple(sorted(edge)) in severe_pairs]
len(intersection) #differential edges also present in the severe network (recorded run with order-sensitive matching: 6 edges out of 116)

6