In [7]:
#Load libraries 
import pandas as pd
from sklearn.preprocessing import StandardScaler
import sspa
import scipy
import numpy as np 

We make two different networks, one for the COVID cases 1-2 compared to COVID cases 3-7 <br>
This is because there are only 18 samples in common between the metabolomic and proteomic datasets

0       Common samples: 18           Metabolomic samples: 133        Proteomic samples: 123 <br>
1-2       Common samples: 45          Metabolomic samples: 45        Proteomic samples: 48 <br>
3-4       Common samples: 56          Metabolomic samples: 57        Proteomic samples: 59 <br>
5-7       Common samples: 27          Metabolomic samples: 28        Proteomic samples: 28 <br>

146 common samples overall,   128 cases (45 samples (WHO 1-2) vs 83 samples (WHO 3-7))

In [8]:
df = pd.read_csv('Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
df_mild = (df[df["WHO_status"] == '1-2']).iloc[:,:-2] #45 samples, remove the metadata
df_severe = (df[(df["WHO_status"] == '3-4') | (df["WHO_status"] == '5-7')]).iloc[:,:-2] #83 samples

### Scale data after subsetting

In [11]:
df_mild = pd.DataFrame(StandardScaler().fit_transform(df_mild),columns=df_mild.columns, index=df_mild.index)
df_severe = pd.DataFrame(StandardScaler().fit_transform(df_severe),columns=df_severe.columns, index=df_severe.index)

Check that data is centred at zero (mean=zero and standard deviation=1 for each analyte):

In [4]:
def check_centred(type):
    print(type.max().max())
    print(type.min().min())
    print(type.mean(axis = 0)) #mean of 0
    print(type.std(axis = 0)) #sd of 1

In [5]:
check_centred(df_mild)

6.6209477232227725
-4.68806063251296
1372    -9.868649e-18
16610    2.467162e-17
72665    0.000000e+00
27823   -3.947460e-17
30915    7.401487e-18
             ...     
28238    3.947460e-17
76341   -7.956598e-17
89312   -1.480297e-17
17861    8.881784e-17
89188    3.454027e-17
Length: 333, dtype: float64
1372     1.0113
16610    1.0113
72665    1.0113
27823    1.0113
30915    1.0113
          ...  
28238    1.0113
76341    1.0113
89312    1.0113
17861    1.0113
89188    1.0113
Length: 333, dtype: float64


In [6]:
check_centred(df_severe)

8.76786459399713
-5.725380981027679
1372    -3.210283e-17
16610   -7.490661e-17
72665    8.025709e-18
27823    6.153043e-17
30915   -4.815425e-17
             ...     
28238   -1.404499e-17
76341   -2.407713e-17
89312    4.932467e-17
17861   -6.688091e-19
89188    3.176843e-18
Length: 333, dtype: float64
1372     1.006079
16610    1.006079
72665    1.006079
27823    1.006079
30915    1.006079
           ...   
28238    1.006079
76341    1.006079
89312    1.006079
17861    1.006079
89188    1.006079
Length: 333, dtype: float64


In [7]:
df_mild

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,64032,133677,133693,133694,28036,28238,76341,89312,17861,89188
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV039,0.407294,2.526194,0.024981,0.21804,-0.18117,-1.035391,-0.518136,-0.194837,0.099676,-0.486964,...,-0.523059,-0.553119,-0.801574,-1.11367,1.328434,1.957696,0.264218,-0.469465,0.072754,-0.437016
INCOV042,1.429706,-1.657584,0.194817,-0.812338,-0.084932,-0.731967,-1.346616,-0.797464,-1.868052,-0.169584,...,-0.812267,0.095104,0.27997,0.008015,-0.419346,-0.892632,-0.603241,-0.504751,1.153761,-1.406727
INCOV056,-1.778223,3.173371,1.707373,-0.468166,0.571892,-0.132435,0.156358,0.781962,2.555767,-0.355751,...,0.681489,-0.975671,-1.013664,-1.427257,0.074581,-0.634625,-0.108644,0.099199,-0.439856,1.836353
INCOV057,1.252397,1.354213,-0.515353,0.350064,-1.921341,-1.491206,-0.216508,0.242145,0.411317,-0.077096,...,-0.267462,0.145149,-0.113996,-0.467826,-0.322626,-0.074316,-0.325932,-0.903624,-0.487578,-0.392106
INCOV058,-1.213282,-0.497386,-0.296312,-0.92304,2.149032,0.64894,4.897828,-0.930717,-0.278801,-0.021439,...,0.7417,-1.333916,-1.373171,-0.650641,2.887196,-1.36339,-0.132697,0.48182,-1.427262,1.555612
INCOV060,0.072084,1.621323,-1.030757,-0.569382,-0.500373,-1.183697,-0.14953,4.899405,2.28968,-0.411854,...,-0.864529,0.473546,2.439046,3.165209,-0.384928,0.403392,-0.645436,-0.429454,-1.058278,0.657456
INCOV061,0.805359,-0.582933,0.752317,-0.696767,-1.742341,0.266384,0.540789,-0.341194,-0.258608,-0.419311,...,-0.631529,-0.082527,-0.250284,-0.05998,0.215674,-0.052186,0.21907,0.149988,-0.921349,0.16956
INCOV063,-0.616898,-0.565445,-0.260833,0.257322,-0.847523,-0.922695,1.383093,0.162799,-1.290761,-0.156144,...,-0.267652,2.764172,2.0895,2.208369,0.948492,-0.311763,1.118836,-0.059689,-0.946847,-0.224965
INCOV066,0.661576,0.148924,1.356975,0.622114,-1.302914,0.518395,-0.096466,-0.717683,1.330898,-0.378776,...,-0.777969,-0.292197,-0.168338,-0.393532,-0.160034,1.355416,1.076344,-0.793559,0.040921,-1.398804
INCOV067,-0.435942,1.243908,0.897895,-0.818793,-0.774587,-1.196931,-0.194121,-0.804576,-0.030912,-0.259604,...,-0.622895,0.676235,0.900528,1.323675,-0.289411,0.8085,-0.392882,2.839094,0.273483,-0.067373


In [8]:
df_severe

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,64032,133677,133693,133694,28036,28238,76341,89312,17861,89188
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV001,-0.231154,-0.725943,-0.582251,-0.251118,-0.068339,-0.793204,-0.313117,0.448729,-0.635039,-0.382799,...,-1.023823,-0.343004,-0.359139,0.134796,-0.362407,0.616660,-0.039671,1.621249,-0.611502,-0.346518
INCOV002,-0.019021,-0.909481,0.710785,-0.679395,-1.083827,0.020233,-1.115562,-0.139103,-0.642881,-0.492701,...,-0.195545,0.826404,0.561248,0.686429,0.115848,0.019932,-0.190632,-0.931902,1.066632,0.782843
INCOV003,-0.736514,-0.274410,4.432358,-0.359594,-1.095534,1.188597,0.253397,-0.035723,-0.463012,-0.089660,...,-0.038311,0.891763,2.022093,1.738393,-0.425408,-1.174113,-0.806421,2.166995,-1.678124,0.657367
INCOV004,-0.446749,-1.506985,-1.245852,-1.371771,-0.108172,-0.543166,0.056119,-0.560944,-1.282841,-0.522401,...,-1.200132,0.728779,0.829129,0.049010,-0.485138,-0.496982,1.417509,2.101582,-1.597753,-0.162225
INCOV005,-0.621744,0.579081,-0.591145,-0.546702,0.395178,0.255822,-0.401583,-0.752622,-1.036320,-0.295656,...,-1.092711,-0.518897,-0.094272,-0.372235,-0.422040,0.785316,-0.556301,-0.353727,-1.039778,-0.579523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV133,0.207921,1.175036,-0.403147,0.119934,-0.101852,0.282620,0.342890,0.243646,0.895894,-0.512162,...,0.782725,0.546436,0.017943,0.642776,-0.663812,-0.657291,-0.105943,-0.389702,0.904204,-0.488136
INCOV136,-0.066952,1.414012,0.307681,0.390193,-0.800148,0.443736,-0.092301,0.677691,1.748565,0.273361,...,-0.387935,-0.513359,-0.619791,-0.624568,-0.497752,-1.147155,0.171224,-0.496692,0.655913,-0.258550
INCOV137,0.372596,0.147821,-0.846153,1.266961,-1.631333,-0.122781,-0.042050,-0.736404,0.107238,-0.552894,...,-1.200687,-1.394631,-1.152810,-1.122083,1.375931,1.532699,0.226475,-0.405351,-0.116984,-0.722303
INCOV139,0.859476,1.672245,1.970187,2.740874,-0.546321,-0.527090,0.259036,0.365804,2.319742,-0.404460,...,-0.224347,-1.018720,-0.911858,-0.796828,0.675603,0.501508,-0.569729,0.981215,-1.833167,0.873879


In [9]:
#Download the reactome pathways
reactome_pathways = sspa.process_gmt("Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

#Download the root pathways
root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}
root_pathway_names = list(root_pathway_dict.keys())



### Step 1: Determine initial test-statistic

In [10]:
#Function to calculate the squared Spearman correlation matrix 

def squared_spearman_corr(data):
    kpca_scores = sspa.sspa_kpca(data, reactome_pathways)   
    kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns))) #using Sara's code to drop root pathways

    spearman_results = scipy.stats.spearmanr(kpca_scores)
    squared_spearman_coef = np.square(spearman_results[0]) #correlation coefficients (spearman_results[1] gives the p-values)

    return squared_spearman_coef,list(kpca_scores.columns)




#Function to calculate the difference between two matrices 

def initial_tstat(data1,data2,edgelist):
    #abs_rho_squared = np.absolute(np.array(data1) - np.array(data2))
    delta_squared = (np.array(data1) - np.array(data2))

    #Mask the upper half of the dataframe (so I don't view the comparisons between the two same genes, and also the duplicate comparisons are removed)
    mask = delta_squared.copy()
    mask = np.triu(np.ones(mask.shape)).astype(bool)
    mask = np.invert(mask) #invert true and false values so the diagonal is False as well
    non_dup_delta_squared = pd.DataFrame(delta_squared, columns = edgelist, index = edgelist)
    non_dup_delta_squared = pd.DataFrame(non_dup_delta_squared).where(mask) #Replace all false values with NaN using mask

    squared_list = non_dup_delta_squared.stack().reset_index()
    squared_list['level_0'] = squared_list["level_0"].astype(str) + ", " + squared_list['level_1']
    squared_list.columns = ["Edges","na","Initial_tstat"]
    squared_list.index = squared_list["Edges"]
    squared_list = squared_list.drop(columns = ["Edges","na"])

    return(squared_list)

Note: For the delta squared correlation values for the unshuffled data (i.e. the real data) I keep the indices (pathway edges). Since I already have a record of the edges, there is no need to keep the edges for each permutation, since the order is the same each time. 

In [11]:
spearman_mild,edgelist = squared_spearman_corr(df_mild)
spearman_severe,edgelist = squared_spearman_corr(df_severe)

output = initial_tstat(spearman_mild,spearman_severe,edgelist)

Note: There are no values with a Spearman correlation value of zero. The reason why some initial test statistics have a value of zero is because for both groups both values are one.

In [12]:
test_df = pd.DataFrame(spearman_mild, columns = edgelist, index = edgelist)
display(test_df.iloc[:25,:25])

Unnamed: 0,R-HSA-110331,R-HSA-112310,R-HSA-112311,R-HSA-112315,R-HSA-1237112,R-HSA-1368082,R-HSA-1368108,R-HSA-1428517,R-HSA-1483148,R-HSA-1483206,...,R-HSA-15869,R-HSA-1592230,R-HSA-159418,R-HSA-1614603,R-HSA-1614635,R-HSA-163685,R-HSA-1655829,R-HSA-1660661,R-HSA-168249,R-HSA-189200
R-HSA-110331,1.0,0.004586,0.004568,0.02499648,0.1976089,0.006249,0.006249,0.087488,0.003737,0.003018,...,0.628041,0.006249,0.065533,0.001701,0.037002,0.027253,0.006249,0.066753,0.000331,0.002533
R-HSA-112310,0.004586,1.0,0.016064,0.878011,0.002681029,0.714571,0.714571,0.000983,0.060248,0.032391,...,0.019504,0.714571,0.025625,0.118884,0.035348,0.593854,0.714571,0.017708,0.02417,0.07539
R-HSA-112311,0.004568,0.016064,1.0,0.02108044,0.02716634,0.000117,0.000117,0.000744,0.758437,0.500384,...,0.000117,0.000117,0.005739,0.00209,0.068397,0.009609,0.000117,0.013843,0.858125,0.042515
R-HSA-112315,0.024996,0.878011,0.02108,1.0,4.339668e-07,0.632224,0.632224,0.001322,0.068535,0.01691,...,0.047488,0.632224,0.010372,0.086323,0.017428,0.486869,0.632224,0.035945,0.030475,0.033106
R-HSA-1237112,0.197609,0.002681,0.027166,4.339668e-07,1.0,0.001552,0.001552,0.004134,0.022837,0.072169,...,0.052374,0.001552,0.030383,0.012571,0.657616,0.027123,0.001552,0.002961,0.016366,0.003753
R-HSA-1368082,0.006249,0.714571,0.000117,0.632224,0.001551882,1.0,1.0,0.005347,0.017568,0.064191,...,0.054691,1.0,0.039895,0.063392,0.001294,0.449907,1.0,0.000918,0.001228,0.048412
R-HSA-1368108,0.006249,0.714571,0.000117,0.632224,0.001551882,1.0,1.0,0.005347,0.017568,0.064191,...,0.054691,1.0,0.039895,0.063392,0.001294,0.449907,1.0,0.000918,0.001228,0.048412
R-HSA-1428517,0.087488,0.000983,0.000744,0.001322314,0.004133863,0.005347,0.005347,1.0,0.012395,0.001995,...,0.012219,0.005347,0.012277,0.166064,0.000773,0.007516,0.005347,0.013781,0.014788,0.017813
R-HSA-1483148,0.003737,0.060248,0.758437,0.06853496,0.02283722,0.017568,0.017568,0.012395,1.0,0.365874,...,0.00333,0.017568,0.016131,0.011901,0.037256,0.0207,0.017568,0.023722,0.78086,0.023318
R-HSA-1483206,0.003018,0.032391,0.500384,0.01691028,0.07216904,0.064191,0.064191,0.001995,0.365874,1.0,...,5.1e-05,0.064191,0.000848,0.004409,0.191796,0.007932,0.064191,0.097255,0.372438,0.007425


In [13]:
output

Unnamed: 0_level_0,Initial_tstat
Edges,Unnamed: 1_level_1
"R-HSA-112310, R-HSA-110331",0.001773
"R-HSA-112311, R-HSA-110331",-0.038910
"R-HSA-112311, R-HSA-112310",-0.144006
"R-HSA-112315, R-HSA-110331",0.014812
"R-HSA-112315, R-HSA-112310",-0.027926
...,...
"R-HSA-975634, R-HSA-9717207",0.067177
"R-HSA-975634, R-HSA-9734207",0.023737
"R-HSA-975634, R-HSA-9735804",0.002574
"R-HSA-975634, R-HSA-9749641",0.015996


In [15]:
output.to_csv("Data/permutation_test_files_metabolomics/initial_tstats.csv")

Calculating pathway scores without pathway analysis:

In [None]:
#Function to calculate the squared Spearman correlation matrix 

def squared_spearman_corr(data):
    spearman_results = scipy.stats.spearmanr(data)
    squared_spearman_coef = np.square(spearman_results[0]) #correlation coefficients (spearman_results[1] gives the p-values)

    return squared_spearman_coef,list(data.columns)




#Function to calculate the absolute difference between two matrices and then determine the mean for each edge

def absolute_val(data1,data2,edgelist):
    abs_rho_squared = np.absolute(np.array(data1) - np.array(data2))

    #Mask the upper half of the dataframe (so I don't view the comparisons between the two same genes, and also the duplicate comparisons are removed)
    mask = abs_rho_squared.copy()
    mask = np.triu(np.ones(mask.shape)).astype(bool)
    mask = np.invert(mask) #invert true and false values so the diagonal is False as well
    non_dup_abs_squared = pd.DataFrame(abs_rho_squared, columns = edgelist, index = edgelist)
    non_dup_abs_squared = pd.DataFrame(non_dup_abs_squared).where(mask) #Replace all false values with NaN using mask

    abs_squared_list = non_dup_abs_squared.stack().reset_index()
    abs_squared_list['level_0'] = abs_squared_list["level_0"].astype(str) + ", " + abs_squared_list['level_1']
    abs_squared_list.columns = ["Edges","na","Initial_tstat"]
    abs_squared_list.index = abs_squared_list["Edges"]
    abs_squared_list = abs_squared_list.drop(columns = ["Edges","na"])

    return(abs_squared_list)

In [None]:
spearman_mild,edgelist = squared_spearman_corr(df_mild)
spearman_severe,edgelist = squared_spearman_corr(df_severe)

output = absolute_val(spearman_mild,spearman_severe,edgelist)

In [None]:
#output

In [None]:
#output.to_csv("Data/permutation_test_files_metabolomics_withoutPA/initial_tstats.csv")

### Step 2: Shuffle the labels

The sample labels are shuffled, rather than assigning the samples to two different groups (since the sizes of the 1-2 class with the 3-7 class is not equal). See HPC_permutation_script_metabolomic for more info.

### Step 3: Read in the permutation files 

Using the HPC, 10 files each store 10k permutations. 10 array jobs are carried out to read in all 10k values, make them absolute and count how many are above the initial test statistic. See permutation_distribution.ipynb for more info.

https://www.jwilber.me/permutationtest/# <br>
We take the absolute value for the absolute difference between the two groups to compute the initial test statistic. Each permutation calculates a new test statistic calculated from the ABSOLUTE difference between the two test groups. It makes no difference whether I take the absolute difference or regular difference between the two, since either way it's a two-tailed p-value distribution, just that with the absolute difference the negative values have been mapped to the positive side.

### Step 4: Compare the difference in edges with other networks

Comparing with the naive networks (mild vs severe) to see if they agree with the differential network results:

In [None]:
import networkx as nx

In [None]:
mild = nx.read_gml("Cytoscape/metabolomic_final_mildcases.gml")
severe = nx.read_gml("Cytoscape/metabolomic_final_severecases.gml")

In [None]:
mild_edges = list(mild.edges())
print(len(mild_edges))
severe_edges = list(severe.edges())
print(len(severe_edges))

In [None]:
#Example code from https://stackoverflow.com/questions/41125909/python-find-elements-in-one-list-that-are-not-in-the-other
list_1=["a", "b", "c", "d", "e"]
list_2=["a", "f", "c", "m"]
set(list_2) - set(list_1)

#set(['m', 'f'])

In [None]:
#Edges present in the mild network but not the severe network
len(set(mild_edges) - set(severe_edges))

In [None]:
#Edges present in the severe network but not the mild network
len(set(severe_edges) - set(mild_edges))

In [None]:
#Reading in the edges expressed in the differential metabolomic network
#with open('Data/permutation_test_files_metabolomics/sig_edges_beforefdr.txt') as f:
with open('Data/permutation_test_files_metabolomics/sig_edges.txt') as f:
    lines = f.readlines()

edges_remaining = []

edges = lines[0].split(",")

for index in range(0,len(edges),2):
    list1 = edges[index],(edges[index+1][1:]) #becomes a tuple
    edges_remaining.append(list1)

print(len(edges_remaining))
edges_remaining

In [None]:
intersection = list(set(edges_remaining).intersection(list(set(mild_edges))))  
len(intersection) #30 edges out of 98

In [None]:
intersection = list(set(edges_remaining).intersection(list(set(severe_edges)))) 
len(intersection) #6 edges out of 116