Spearman correlation coefficient p-values

In [None]:
#Spearman correlation coefficient: Bonferroni-corrected p-values, duplicates removed

#OUTPUT IS A DATAFRAME
#SEE CELL BELOW FOR OUTPUT AS NUMPY ARRAY

#Keep only the strictly lower triangle of the p-value matrix: True below the
#diagonal, False on the diagonal (a variable compared with itself) and above it
#(the second copy of every pair)
mask = np.tril(np.ones(spearman_pvals.shape, dtype=bool), k=-1)
#display(pd.DataFrame(mask[:8,:8]))

#Uncorrected p-values in dataframe format; every masked-out cell becomes NaN
non_dup_spearman_pvals = pd.DataFrame(spearman_pvals).where(mask)
#display(non_dup_spearman_pvals)

#Corrected p-values in dataframe format (Bonferroni: multiply by the number of tests)
#The number of tests is the number of retained cells, (n*n - n) / 2, which is
#12720 for a 160 x 160 matrix. Deriving it from the mask keeps the correction
#right if the matrix size changes.
n_tests = int(mask.sum())
#A corrected p-value is a probability, so cap it at 1 (same convention as
#statsmodels' 'bonferroni' method); NaN cells stay NaN
corrected_non_dup_spearman_pvals = (non_dup_spearman_pvals * n_tests).clip(upper=1.0)
#Relabel rows and columns with the kpca_scores column names
corrected_non_dup_spearman_pvals = pd.DataFrame(data=corrected_non_dup_spearman_pvals.values, columns=kpca_scores.columns, index=kpca_scores.columns)
display(corrected_non_dup_spearman_pvals)



Final correlation network - Removing duplicates in the edge list

In [None]:
#Remove duplicates in the edge list

#Put every edge into a canonical orientation (Pathway1 <= Pathway2) so that
#"A-B" and "B-A" become identical rows.
#zip() walks the two columns positionally, so this does not depend on the
#index of `edgelist` being 0..n-1.
ordered_pairs = [
    (min(val1, val2), max(val1, val2))
    for val1, val2 in zip(edgelist['Pathway1'], edgelist['Pathway2'])
]

#Assign whole columns at once. Element-wise `edgelist_copy.Pathway1[i] = ...`
#is chained assignment: it triggers SettingWithCopyWarning and is silently
#ignored when pandas Copy-on-Write is active.
edgelist_copy = edgelist.copy()
edgelist_copy['Pathway1'] = [pair[0] for pair in ordered_pairs]
edgelist_copy['Pathway2'] = [pair[1] for pair in ordered_pairs]

edgelist_copy = edgelist_copy.sort_values(['Pathway1','Pathway2'], ascending=True)

#Keep the first row of each pathway pair. Unlike taking every second row,
#this stays correct when a pair occurs once or more than twice.
edgelist_copy = edgelist_copy.drop_duplicates(subset=['Pathway1','Pathway2'], keep='first')
display(edgelist_copy[:20])


Calculating the percentage of significant values when I use the full number of tests (n x n) rather than multiplying by the number of independent tests

In [None]:
#Bonferroni correction on the full n x n p-value matrix, i.e. WITHOUT removing
#the duplicated pairs (just to check how much accounting for duplicates affects
#the number of metabolites retained)

#Counts on the raw p-values: before correction and BEFORE DUPLICATES ARE REMOVED
below_cutoff_full = spearman_pvals < 0.005
at_or_above_cutoff_full = spearman_pvals >= 0.005
print("Number of significant values before correction and before duplicate removal:", below_cutoff_full.sum())
print("Number of non-significant values before correction and before duplicate removal:", at_or_above_cutoff_full.sum())


#statsmodels expects a one-dimensional array, hence the flatten()
corrected_pvals = statsmodels.stats.multitest.multipletests(
    spearman_pvals.flatten(),
    alpha=0.05,
    method='bonferroni',
)
#Element [0] of the result is the boolean reject array, [1] the corrected p-values
sig_vals = corrected_pvals[1]
#display(sig_vals)

print("Number of significant values:", (sig_vals < 0.005).sum())
print("Number of non-significant values:", (sig_vals >= 0.005).sum())

#----------------------------------------------------------------


#Bonferroni correction on the p-values after the duplicates are removed

#Boolean mask that is True strictly below the diagonal, so the diagonal and the
#upper half of the matrix are left out
mask = np.tril(np.ones(spearman_pvals.shape), k=-1).astype(bool)

#Boolean indexing returns the retained p-values as a one-dimensional numpy array
non_dup_spearman_pvals = spearman_pvals[mask]

#https://tedboy.github.io/statsmodels_doc/generated/statsmodels.stats.multitest.multipletests.html#statsmodels.stats.multitest.multipletests
print("Number of significant values before correction and after duplicate removal:", (non_dup_spearman_pvals < 0.005).sum())
print("Number of non-significant values before correction and after duplicate removal:", (non_dup_spearman_pvals >= 0.005).sum())
#print(non_dup_spearman_pvals)

#Corrected p-values for the duplicate-free set (statsmodels package)
corrected_non_dup_pvals = statsmodels.stats.multitest.multipletests(
    non_dup_spearman_pvals,
    method='bonferroni',
)
sig_vals = corrected_non_dup_pvals[1]
#display(sig_vals)

print("Number of significant values:", (sig_vals < 0.005).sum())
print("Number of non-significant values:", (sig_vals >= 0.005).sum())

In [None]:
#How many p-values remain significant after multiple testing correction?

#Build the "strictly below the diagonal" mask: start from the upper triangle
#(diagonal included) and invert it
upper_with_diagonal = np.triu(np.ones(spearman_pvals.shape, dtype=bool))
mask = ~upper_with_diagonal

#One-dimensional numpy array holding only the retained (non-duplicate) p-values
non_dup_spearman_pvals = spearman_pvals[mask]

#https://tedboy.github.io/statsmodels_doc/generated/statsmodels.stats.multitest.multipletests.html#statsmodels.stats.multitest.multipletests
n_sig_uncorrected = np.count_nonzero(non_dup_spearman_pvals < 0.005)
n_nonsig_uncorrected = np.count_nonzero(non_dup_spearman_pvals >= 0.005)
print("Number of significant values before correction:", n_sig_uncorrected)
print("Number of non-significant values before correction:", n_nonsig_uncorrected)
#print(non_dup_spearman_pvals)

#Bonferroni correction via statsmodels; item [1] of the result holds the corrected p-values
corrected_non_dup_pvals = statsmodels.stats.multitest.multipletests(non_dup_spearman_pvals, method='bonferroni')
sig_vals = corrected_non_dup_pvals[1]
#display(sig_vals)

print("Number of significant values:", np.count_nonzero(sig_vals < 0.005))
print("Number of non-significant values:", np.count_nonzero(sig_vals >= 0.005))

Making an edgelist for NetworkX from a dataframe

In [None]:

#Build a NetworkX graph from the square matrix oc_df

#Cecilia's code

#does not deal with duplicates i.e.  Pathway 1 to Pathway 2 and vice versa
#but it's ok, because it shows up as one edge on the network in Cytoscape

#Make values on the diagonal NaN.
#This is done on an explicit float copy because filling `oc_df.values` in place
#only works when .values is a writable view, and NaN cannot be stored in an
#integer array. oc_df is then rebound so it still ends up with a NaN diagonal,
#as before.
oc_values = oc_df.to_numpy(dtype=float, copy=True)
np.fill_diagonal(oc_values, np.nan)
oc_df = pd.DataFrame(oc_values, index=oc_df.index, columns=oc_df.columns)

#Long format: one row per (row label, column label, value). dropna() removes the
#NaN cells explicitly, because newer pandas no longer drops them inside stack().
edgelist_oc = oc_df.stack().dropna().reset_index()
edgelist_oc.columns = ['Pathway1', 'Pathway2', 'Weight']
G = nx.from_pandas_edgelist(df=edgelist_oc, source='Pathway1', target='Pathway2', edge_attr='Weight')

#Add the nodes AFTER building the graph. Previously they were added to a graph
#that was then overwritten, so any node without an edge was lost.
G.add_nodes_from(oc_df.columns)

nx.draw(G, with_labels = True)
#nx.write_gml(G, "metabolomic_oc.gml")

### Code for reading in the permutation files and calculating p-values for each pathway pair

Permutation distribution from files that have been 'pickled' (using pickle package):

In [None]:

import pandas as pd
import os
import pickle

#Permutation p-value for every pathway pair, from pickled permutation runs

#Download the initial test statistics
df = pd.read_csv('metabolomics/Data/initial_tstats.csv', index_col=0)

path = os.getcwd() + '/metabolomics/Results'

#Load every file in the results folder (except the initial statistics); each
#one is unpickled into one entry of final_list and indexed by pathway position below.
#NOTE(review): pickle.load can execute arbitrary code - only use trusted files.
final_list = []

for filename in os.listdir(path):
    if filename != 'initial_tstats.csv':
        with open(os.path.join(path, filename), 'rb') as file: # open in readonly mode
            final_list.append(pickle.load(file))

#Fail with a clear message instead of a ZeroDivisionError further down
if not final_list:
    raise FileNotFoundError("No permutation files found in " + path)

n_permutations = len(final_list)   #number of permutations

sig_edges = []
edgelist = df.index

for index,pathway_pair in enumerate(edgelist):   #test all pathways
    #.iloc: `index` is a position, not a label of df's index.
    #abs(): the permuted statistics are compared by magnitude, so the observed
    #one must be too - otherwise a negative statistic always gives p = 1.
    comparison = abs(df.Initial_tstat.iloc[index])    #get initial test statistic

    #Number of permutations at least as extreme as the observed statistic
    counter = sum(1 for list1 in final_list if abs(list1[index]) >= comparison)

    p_val = counter / n_permutations    #divide number of tests above or equal to the test statistic by total number of tests

    if p_val < 0.01:  #for 100,000 permutations
        sig_edges.append(pathway_pair)

print(len(sig_edges))

with open("metabolomics/Data/sig_edges.txt", "wb") as file:   #Pickling (stores as list format instead of string, easier to read in later)
    pickle.dump(sig_edges , file)

Iterating through each permutation file for each pathway pair, storing the values in one list, then outputting the pathway pair if it is significant

In [None]:

import pandas as pd
import os


#Permutation p-value per pathway pair, from comma-separated permutation files

#Download the initial test statistics
df = pd.read_csv('metabolomics/Data/initial_tstats.csv', index_col=0)

path = os.getcwd() + '/metabolomics/Results'

edgelist = df.index

#NOTE: only the first 5 pathway pairs are tested here; use `edgelist` to test all
tested_pairs = edgelist[:5]

#Read each permutation file ONCE and keep only the leading values needed for
#the tested pairs. Previously every file was re-opened and fully read for each pair.
#Only the first line of a file is used.
permutation_stats = []
for filename in os.listdir(path):
    if filename != 'initial_tstats.csv':
        with open(os.path.join(path, filename)) as file:
            fields = file.readline().split(',')
        permutation_stats.append([float(value) for value in fields[:len(tested_pairs)]])

#Fail with a clear message instead of a ZeroDivisionError further down
if not permutation_stats:
    raise FileNotFoundError("No permutation files found in " + path)

sig_edges = []

for index,pathway_pair in enumerate(tested_pairs):
    #Permuted statistics for this pathway pair, one per file
    pathway_list = [stats[index] for stats in permutation_stats]   #len(pathway_list) = number of permutations

    print(pathway_list, pathway_pair)

    #.iloc: `index` is a position, not a label of df's index
    comparison = df.Initial_tstat.iloc[index]    #get initial test statistic
    print(comparison)

    #Compare magnitudes on both sides; comparing abs(permuted) with a negative
    #observed statistic would count every permutation and always give p = 1
    observed = abs(comparison)
    counter = sum(1 for num in pathway_list if abs(num) >= observed)

    print(counter)
    p_val = (counter/len(pathway_list))    #divide number of tests above or equal to the test statistic by total number of tests

    if p_val < 0.01:  #for 100,000 permutations
        sig_edges.append(pathway_pair)

pathway_list = []

print(sig_edges)

with open ('metabolomics/Data/sig_edges.txt', 'w') as file:
    file.write(','.join(str(i) for i in sig_edges))

Calculating the number of absolute values above the test statistic for the metabolomic data. I originally had all 100k files in one folder (but later changed to splitting them into 10 folders of 10k files each):

In [None]:
import pandas as pd
import os
import random
import pickle

#Write out the permutation distribution of one randomly chosen pathway pair

#Initial test statistics; only the number of rows is used here
df = pd.read_csv('metabolomics/Data/initial_tstats.csv', index_col=0)

path = os.getcwd() + '/metabolomics/Results_pickled_100k'

#Random position in 0 .. len(df)-1, used to index the unpickled lists
index = random.randrange(0, len(df))

#Take the value at that position from every file whose name starts with 'Run'
pathway_list = []
run_files = [name for name in os.listdir(path) if name.startswith('Run')]
for name in run_files:
    with open(os.path.join(path, name), 'rb') as handle:
        permuted_stats = pickle.load(handle)
    pathway_list.append(permuted_stats[index])

#Save the values as one comma-separated line; the file name carries the chosen index
output_file = 'metabolomics/Data/test_distribution' + str(index) + '.txt'
with open(output_file, 'w') as handle:
    handle.write(','.join(map(str, pathway_list)))

#Reset the list after the values have been written
pathway_list = []


