This script test the number of pathways and nodes present after we subset the 83 severe samples to 45 to match the mild cases. We run this 100 times. Using the same subset dataset we test for the whole dataset, and also the severe naive networks.

We make two different networks, one for the COVID cases 1-2 compared to COVID cases 3-7 <br>
This is because there are only 18 samples in common between the metabolomic and proteomic datasets

0       Common samples: 18           Metabolomic samples: 133        Proteomic samples: 123 <br>
1-2       Common samples: 45          Metabolomic samples: 45        Proteomic samples: 48 <br>
3-4       Common samples: 56          Metabolomic samples: 57        Proteomic samples: 59 <br>
5-7       Common samples: 27          Metabolomic samples: 28        Proteomic samples: 28 <br>

146 common samples overall,   128 cases, composed of (45 samples (WHO 1-2) vs 83 samples (WHO 3-7))

### Reading in the files

In [2]:
import pandas as pd
import sspa
import random
from sklearn.preprocessing import StandardScaler
import scipy
import numpy as np
import networkx as nx
import os

In [188]:
#Load the common cases dataset
original_df = pd.read_csv('Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
reactome_pathways = sspa.process_gmt("Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}
root_pathway_names = list(root_pathway_dict.keys())

### Subset the severe case samples and adding back onto the mild cases

In [149]:
df_mild = (original_df[original_df["WHO_status"] == '1-2']) #45 samples, no need to remove the metadata, since I do that in a later step
df_severe = (original_df[(original_df["WHO_status"] == '3-4') | (original_df["WHO_status"] == '5-7')]) #83 samples

In [None]:
df_severe

In [157]:
sample_list = []
while len(sample_list) < 45:
    subset = random.choice(df_severe.index.tolist())
    if subset not in sample_list:
        sample_list.append(subset)

In [None]:
len(sample_list)

In [158]:
#Subsetting the severe df to the sample list only
new_df_severe = df_severe.loc[df_severe.index.isin(sample_list), : ]
new_df_severe

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,133693,133694,28036,28238,76341,89312,17861,89188,WHO_status,Group
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV002,-0.125585,-1.002246,0.399273,-0.828341,-0.980576,0.373921,-1.076979,-0.131876,-0.869093,-0.467326,...,0.509336,0.627269,0.321718,-0.158514,0.001469,-1.051485,1.29933,0.57856,5-7,COVID19
INCOV005,-0.741957,0.387909,-0.711036,-0.711515,0.67072,0.610591,-0.322241,-0.793451,-1.230681,-0.246493,...,-0.184967,-0.469553,-0.293869,0.558366,-0.390308,-0.455737,-0.535223,-0.847727,3-4,COVID19
INCOV007,-0.136587,-0.393119,-0.294327,-0.628552,-0.177814,-0.260732,-0.525029,-0.717994,-0.692478,-0.106265,...,-0.309071,-0.101201,-0.327181,-0.566064,0.017056,0.091596,0.654008,-0.327013,3-4,COVID19
INCOV009,-1.023215,-0.40567,-0.234728,-0.060196,2.552796,2.480172,2.063091,0.729072,-0.319633,4.250094,...,-3.250139,-3.803128,1.156986,2.151032,3.104717,-0.824745,1.948632,-1.717798,3-4,COVID19
INCOV012,0.807189,0.450076,0.08752,-0.590381,1.757547,-0.68405,2.036407,-0.507747,-0.888873,-0.403608,...,-0.505135,-0.242446,-0.320456,-0.663736,0.912045,0.060167,1.167242,-0.057357,3-4,COVID19
INCOV013,-0.424609,-0.758431,0.52035,-1.197386,-0.364411,0.898,0.085133,-0.665907,-1.388233,-0.446587,...,0.113726,0.152841,-0.550163,-1.579064,1.076975,-0.511027,-0.971738,0.346868,5-7,COVID19
INCOV016,-0.628094,-1.172915,0.352961,-0.87594,0.409128,2.11576,-0.35829,0.004944,-1.104045,0.384876,...,0.075013,-0.29012,-0.473562,1.198909,-0.092835,0.036063,0.483396,-1.218819,3-4,COVID19
INCOV018,0.576373,-0.615139,-0.069942,-0.71118,3.787851,1.191754,0.094155,-0.5737,-1.224146,0.310379,...,-0.713934,-0.395035,0.026235,2.657141,-0.713727,-0.657965,-0.153286,-0.900097,3-4,COVID19
INCOV022,1.332761,-0.937254,-0.488329,-1.2572,-1.624926,1.063767,0.604112,-0.716277,-0.623717,-0.123209,...,1.011649,0.096449,0.396325,-1.864423,2.595319,-1.345479,0.666693,6.105379,5-7,COVID19
INCOV024,0.082036,-0.578039,0.168801,-0.803106,-0.384362,-1.003926,-0.818905,-0.083876,-0.868447,-0.277197,...,0.098121,-0.431793,-0.303784,-0.115667,-0.017763,-1.385399,0.795837,-0.333189,3-4,COVID19


In [159]:
df = pd.concat([df_mild, new_df_severe], axis=0)

In [160]:
df

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,133693,133694,28036,28238,76341,89312,17861,89188,WHO_status,Group
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV039,0.572681,2.947307,0.409855,0.657044,-0.517260,-1.244324,-0.630876,-0.196207,0.606751,-0.497386,...,-0.531280,-0.858450,0.303258,2.343951,-0.191630,-0.262584,-0.621128,0.129973,1-2,COVID19
INCOV042,1.518855,-1.466871,0.601883,-0.441534,-0.467355,-1.070796,-1.338015,-0.699688,-1.239490,-0.275223,...,0.397392,0.162043,-0.554701,-0.592935,-0.808188,-0.294972,0.299785,-0.568024,1-2,COVID19
INCOV056,-1.449867,3.630123,2.312075,-0.074582,-0.126757,-0.727925,-0.055171,0.619887,2.911204,-0.405538,...,-0.713392,-1.143747,-0.312240,-0.327092,-0.456647,0.259377,-1.057823,1.766340,1-2,COVID19
INCOV057,1.354767,1.710785,-0.201081,0.797807,-1.419633,-1.505004,-0.373426,0.168882,0.899152,-0.210482,...,0.059112,-0.270870,-0.507223,0.250231,-0.611088,-0.661087,-1.098477,0.162299,1-2,COVID19
INCOV058,-0.927053,-0.242781,0.046580,-0.559563,0.691076,-0.281057,3.991851,-0.811018,0.251642,-0.171523,...,-1.022084,-0.437192,1.068431,-1.077989,-0.473743,0.610574,-1.898998,1.564263,1-2,COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV124,0.717949,0.228993,-0.817579,0.087929,-0.311806,0.196728,-1.003197,-0.629954,-0.397692,0.009126,...,1.684218,1.406422,-0.245720,-0.514091,-0.199357,-0.846854,-0.032320,-0.943631,3-4,COVID19
INCOV133,0.106495,0.944466,-0.550708,-0.124598,0.115790,0.637512,0.464733,0.280852,0.545110,-0.489136,...,-0.066113,0.582042,-0.570565,-0.792820,0.092206,-0.492807,1.157865,-0.752052,3-4,COVID19
INCOV136,-0.174602,1.167644,0.055499,0.113342,-0.663851,0.799367,0.004697,0.748894,1.328753,0.391221,...,-0.741578,-0.730981,-0.380517,-1.251641,0.389163,-0.603048,0.941619,-0.511694,3-4,COVID19
INCOV137,0.274900,-0.014841,-0.928512,0.885263,-1.591862,0.230250,0.057818,-0.775962,-0.179700,-0.534786,...,-1.306132,-1.246428,1.763824,1.258387,0.448359,-0.508931,0.268474,-0.997206,5-7,COVID19


### Scaling the data

In [None]:
#df_norm = pd.DataFrame(StandardScaler().fit_transform(df_num),columns=df_num.columns, index=df_num.index)

#Add metadata to the end of the df
#df_final = pd.concat([df_norm, df.iloc[:,-2:]],axis=1) 
#df_final.to_csv('Data/Su_COVID_metabolomics_processed_commoncases.csv')

In [None]:
df_num  = df.iloc[:,:-2] #all rows, all columns apart from last two
df_norm = pd.DataFrame(StandardScaler().fit_transform(df_num),columns=df_num.columns, index=df_num.index)

In [None]:
#Check data is scaled
print(df_norm.max().max())
print(df_norm.min().min())
print(df_norm.mean(axis = 0)) #mean of 0
print(df_norm.std(axis = 0)) #sd of 1

In [None]:
df_final = pd.concat([df_norm, df.iloc[:,-2:]],axis=1) #add metadata back on

### Pathway analysis

In [None]:
kpca_scores = sspa.sspa_kpca(df_final.iloc[:,:-2], reactome_pathways)
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

### Network analysis

Spearman correlation coefficient:

In [None]:
spearman_results = scipy.stats.spearmanr(kpca_scores)

spearman_coef = spearman_results[0] #correlation coefficients
spearman_pvals = spearman_results[1] #p-values


#Using Sara's code (rather than having separate dataframes for each analysis, add all together in long format)
squared_spearman_coef_df = pd.DataFrame(spearman_coef,columns = kpca_scores.columns, index=kpca_scores.columns)
squared_spearman_coef_list = squared_spearman_coef_df.stack().reset_index()
squared_spearman_coef_list.columns = ["Pathway1", "Pathway2", "Spearman_corr"]
squared_spearman_coef_list["Squared_corr"]  = np.square(squared_spearman_coef_list.Spearman_corr)

spearman_pvals_df = pd.DataFrame(spearman_pvals,columns = kpca_scores.columns, index=kpca_scores.columns)
spearman_pvals_list = spearman_pvals_df.stack().reset_index()
spearman_pvals_list.columns = ["Pathway1", "Pathway2", "pval"]

#Multiple testing correction for the p-values to prepare the corrected p-values for the final correlation network
#Multiplies by the correct number of tests (i.e. not including the duplicates or self-comparisons)
#Does not remove the diagonals or the duplicates themselves
# E.g. ((160x160)-160)  / 2   (if there are 160 pathways)
num_of_tests = (len(kpca_scores.columns)**2 - len(kpca_scores.columns))/2
print(num_of_tests)
corrected_spearman_pvals = spearman_pvals_list.pval*num_of_tests
#If the p-val goes beyond 1 (max number for a p-value, change to 1)
corrected_spearman_pvals = np.where(corrected_spearman_pvals < 1, corrected_spearman_pvals, 1)
spearman_pvals_list["pval_adj"]  = corrected_spearman_pvals

spearman_df = squared_spearman_coef_list.merge(spearman_pvals_list,on=["Pathway1","Pathway2"])

display(spearman_df)

Overlap coefficient:

In [None]:
#Obtain pathways and corresponding metabolites for all Reactome pathways, store as dictionary
orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)
#Filter out dictionary to retain only the pathways that remain after kPCA
my_keys = kpca_scores.columns
pathways_dict = {key: orig_dict[key] for key in my_keys}


#Filter out the compounds in the pathways that are not present in the dataset
#Obtain all unique values in dataset
compounds_present = list(df.columns[:-2])
filtered_dict = {} 

#My code adapted from Cecilia's
#If the key values are not part of the compounds in dataset then remove
for key,value in pathways_dict.items():
    new_val = [item for item in value if item in compounds_present]
    if len(new_val) >= 2: #at least two compounds in the pathway
        filtered_dict[key] = new_val


oc_matrix = np.zeros((len(my_keys),len(my_keys)))    

for i in range(0,len(my_keys)):   
    list1 = filtered_dict[my_keys[i]]
    
    for j in range(0,len(my_keys)):
        list2 = filtered_dict[my_keys[j]]

        # Szymkiewicz–Simpson coefficient
        #Find intersection between two lists
        intersection = len(list(set(list1).intersection(list(set(list2)))))
        smaller_set = min(len(list1), len(list2))

        val = intersection/smaller_set
        oc_matrix[i][j] = val 

oc_df = pd.DataFrame(oc_matrix, index=filtered_dict.keys(), columns=filtered_dict.keys())

In [None]:
oc_df

In [None]:
oc_list = oc_df.stack().reset_index()
oc_list.columns = ["Pathway1", "Pathway2", "Overlap_coef"]
spearman_df = spearman_df.merge(oc_list,on=["Pathway1","Pathway2"])

display(spearman_df)

Remove self-correlations:

In [None]:
spearman_df = spearman_df [spearman_df.Pathway1 != spearman_df.Pathway2]
spearman_df = spearman_df.reset_index(drop=True)

Construct the network graph:

In [None]:
final_df = spearman_df[spearman_df["pval_adj"] < 0.005]  
final_df = final_df[final_df["Overlap_coef"] < 0.5]
final_df = final_df.reset_index(drop=True) 
display(final_df) #the duplicate edges have not been removed yet

In [None]:
#Draw network graph with new edges
G = nx.Graph()
G = nx.from_pandas_edgelist(df=final_df, source='Pathway1', target='Pathway2', edge_attr='Squared_corr')
#G.add_nodes_from(isolated_nodes)
nx.draw(G, with_labels = True)
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
list(G.nodes())

### Writing the results

In [None]:
index = 1

In [None]:
with open ('Data/sample_size_metabolomics/nodes/Run'+str(index)+'.txt', "w") as file:
     file.write(','.join(str(i) for i in list(G.nodes())))

In [None]:
with open ('Data/sample_size_metabolomics/edges/Run'+str(index)+'.txt', "w") as file:
     file.write(','.join(str(i) for i in list(G.edges())))

### Writing as a function

In [191]:
def subset_sample (original_df):
    df_mild = (original_df[original_df["WHO_status"] == '1-2']) #45 samples, no need to remove the metadata, since I do that in a later step
    df_severe = (original_df[(original_df["WHO_status"] == '3-4') | (original_df["WHO_status"] == '5-7')]) #83 samples

    sample_list = []
    while len(sample_list) < 45:
        subset = random.choice(df_severe.index.tolist())
        if subset not in sample_list:
            sample_list.append(subset)

    #Subsetting the severe df to the sample list only
    new_df_severe = df_severe.loc[df_severe.index.isin(sample_list), : ]


    #For the whole dataset:
    df = pd.concat([df_mild, new_df_severe], axis=0)
    df_num  = df.iloc[:,:-2] #all rows, all columns apart from last two
    df_norm = pd.DataFrame(StandardScaler().fit_transform(df_num),columns=df_num.columns, index=df_num.index)
    df_final = pd.concat([df_norm, df.iloc[:,-2:]],axis=1) #add metadata back on

    #For the severe dataset only:
    #df_num  = new_df_severe.iloc[:,:-2] #all rows, all columns apart from last two
    #df_norm = pd.DataFrame(StandardScaler().fit_transform(df_num),columns=df_num.columns, index=df_num.index)
    #df_final = pd.concat([df_norm, new_df_severe.iloc[:,-2:]],axis=1) #add metadata back on

    sample_list.sort()
    #print(sample_list)
    #print(len(df_final))
    #display(df_final)
    return df_final, sample_list

In [193]:
def network_construction (df_final):
    kpca_scores = sspa.sspa_kpca(df_final.iloc[:,:-2], reactome_pathways)
    kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))


    spearman_results = scipy.stats.spearmanr(kpca_scores)

    spearman_coef = spearman_results[0] #correlation coefficients
    spearman_pvals = spearman_results[1] #p-values


    #Using Sara's code (rather than having separate dataframes for each analysis, add all together in long format)
    squared_spearman_coef_df = pd.DataFrame(spearman_coef,columns = kpca_scores.columns, index=kpca_scores.columns)
    squared_spearman_coef_list = squared_spearman_coef_df.stack().reset_index()
    squared_spearman_coef_list.columns = ["Pathway1", "Pathway2", "Spearman_corr"]
    squared_spearman_coef_list["Squared_corr"]  = np.square(squared_spearman_coef_list.Spearman_corr)

    spearman_pvals_df = pd.DataFrame(spearman_pvals,columns = kpca_scores.columns, index=kpca_scores.columns)
    spearman_pvals_list = spearman_pvals_df.stack().reset_index()
    spearman_pvals_list.columns = ["Pathway1", "Pathway2", "pval"]

    #Multiple testing correction for the p-values to prepare the corrected p-values for the final correlation network
    #Multiplies by the correct number of tests (i.e. not including the duplicates or self-comparisons)
    #Does not remove the diagonals or the duplicates themselves
    # E.g. ((160x160)-160)  / 2   (if there are 160 pathways)
    num_of_tests = (len(kpca_scores.columns)**2 - len(kpca_scores.columns))/2
    
    corrected_spearman_pvals = spearman_pvals_list.pval*num_of_tests
    #If the p-val goes beyond 1 (max number for a p-value, change to 1)
    corrected_spearman_pvals = np.where(corrected_spearman_pvals < 1, corrected_spearman_pvals, 1)
    spearman_pvals_list["pval_adj"]  = corrected_spearman_pvals

    spearman_df = squared_spearman_coef_list.merge(spearman_pvals_list,on=["Pathway1","Pathway2"])


    #Obtain pathways and corresponding metabolites for all Reactome pathways, store as dictionary
    orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)
    #Filter out dictionary to retain only the pathways that remain after kPCA
    my_keys = kpca_scores.columns
    pathways_dict = {key: orig_dict[key] for key in my_keys}


    #Filter out the compounds in the pathways that are not present in the dataset
    #Obtain all unique values in dataset
    compounds_present = list(original_df.columns[:-2])
    filtered_dict = {} 

    #My code adapted from Cecilia's
    #If the key values are not part of the compounds in dataset then remove
    for key,value in pathways_dict.items():
        new_val = [item for item in value if item in compounds_present]
        if len(new_val) >= 2: #at least two compounds in the pathway
            filtered_dict[key] = new_val


    oc_matrix = np.zeros((len(my_keys),len(my_keys)))    

    for i in range(0,len(my_keys)):   
        list1 = filtered_dict[my_keys[i]]
        
        for j in range(0,len(my_keys)):
            list2 = filtered_dict[my_keys[j]]

            # Szymkiewicz–Simpson coefficient
            #Find intersection between two lists
            intersection = len(list(set(list1).intersection(list(set(list2)))))
            smaller_set = min(len(list1), len(list2))

            val = intersection/smaller_set
            oc_matrix[i][j] = val 

    oc_df = pd.DataFrame(oc_matrix, index=filtered_dict.keys(), columns=filtered_dict.keys())


    oc_list = oc_df.stack().reset_index()
    oc_list.columns = ["Pathway1", "Pathway2", "Overlap_coef"]
    spearman_df = spearman_df.merge(oc_list,on=["Pathway1","Pathway2"])

    spearman_df = spearman_df [spearman_df.Pathway1 != spearman_df.Pathway2]
    spearman_df = spearman_df.reset_index(drop=True)

    return spearman_df

In [239]:
def network_test(spearman_df):
    final_df = spearman_df[spearman_df["pval_adj"] < 0.005]  
    final_df = final_df[final_df["Overlap_coef"] < 0.5]
    final_df = final_df.reset_index(drop=True) 

    final_df_copy = final_df.copy()


    #display(final_df)
    #Draw network graph with new edges
    G = nx.Graph()
    G = nx.from_pandas_edgelist(df=final_df_copy, source='Pathway1', target='Pathway2', edge_attr='Squared_corr')
    #G.add_nodes_from(isolated_nodes)
    #nx.draw(G, with_labels = True)
    print(G.number_of_nodes())
    print(G.number_of_edges())

    return list(G.nodes()), list(G.edges())


In [240]:
df_final,sample_list1 = subset_sample(original_df)
spearman_df1 = network_construction(df_final)
nodes1,edges1 = network_test(spearman_df1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_copy.Pathway1[i] = min(val1,val2)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_copy.Pathway2[i] = max(val1,val2)


91
592


In [196]:
df_final,sample_list2 = subset_sample(original_df)
spearman_df2 = network_construction(df_final)
nodes2,edges2 = network_test(spearman_df2)

95
573


In [201]:
sample_list1 == sample_list2

False

In [283]:
print(len(set(sample_list1) & set(sample_list2))) #could have as little as 7 samples in common
print(len(list(set(sample_list1) - set(sample_list2)))) #could have a difference of up to 40 samples

27
18


In [241]:
spearman_df1

Unnamed: 0,Pathway1,Pathway2,Spearman_corr,Squared_corr,pval,pval_adj,Overlap_coef
0,R-HSA-110331,R-HSA-112310,0.122015,0.014888,2.519431e-01,1.000000e+00,0.0
1,R-HSA-110331,R-HSA-112311,-0.155073,0.024048,1.444473e-01,1.000000e+00,0.0
2,R-HSA-110331,R-HSA-112315,0.211738,0.044833,4.513203e-02,1.000000e+00,0.0
3,R-HSA-110331,R-HSA-1237112,-0.797292,0.635674,5.366310e-21,5.525153e-17,0.5
4,R-HSA-110331,R-HSA-1368082,-0.098193,0.009642,3.571817e-01,1.000000e+00,0.0
...,...,...,...,...,...,...,...
20587,R-HSA-975634,R-HSA-9717207,0.248928,0.061965,1.798643e-02,1.000000e+00,0.0
20588,R-HSA-975634,R-HSA-9734207,-0.091740,0.008416,3.898061e-01,1.000000e+00,0.0
20589,R-HSA-975634,R-HSA-9735804,-0.158546,0.025137,1.355589e-01,1.000000e+00,0.0
20590,R-HSA-975634,R-HSA-9749641,0.206124,0.042487,5.128148e-02,1.000000e+00,0.0


In [285]:
spearman_df2

Unnamed: 0,Pathway1,Pathway2,Spearman_corr,Squared_corr,pval,pval_adj,Overlap_coef
0,R-HSA-110331,R-HSA-112310,-0.082932,0.006878,4.370970e-01,1.000000e+00,0.0
1,R-HSA-110331,R-HSA-112311,-0.079030,0.006246,4.590413e-01,1.000000e+00,0.0
2,R-HSA-110331,R-HSA-112315,-0.118657,0.014079,2.653303e-01,1.000000e+00,0.0
3,R-HSA-110331,R-HSA-1237112,0.850829,0.723911,2.529261e-26,2.604127e-22,0.5
4,R-HSA-110331,R-HSA-1368082,0.125670,0.015793,2.379091e-01,1.000000e+00,0.0
...,...,...,...,...,...,...,...
20587,R-HSA-975634,R-HSA-9717207,0.428374,0.183504,2.530880e-05,2.605795e-01,0.0
20588,R-HSA-975634,R-HSA-9734207,0.067671,0.004579,5.262533e-01,1.000000e+00,0.0
20589,R-HSA-975634,R-HSA-9735804,-0.123431,0.015235,2.464407e-01,1.000000e+00,0.0
20590,R-HSA-975634,R-HSA-9749641,0.306285,0.093810,3.325143e-03,1.000000e+00,0.0


In [286]:
spearman_df1 == spearman_df2

Unnamed: 0,Pathway1,Pathway2,Spearman_corr,Squared_corr,pval,pval_adj,Overlap_coef
0,True,True,False,False,False,True,True
1,True,True,False,False,False,True,True
2,True,True,False,False,False,True,True
3,True,True,False,False,False,False,True
4,True,True,False,False,False,True,True
...,...,...,...,...,...,...,...
20587,True,True,False,False,False,False,True
20588,True,True,False,False,False,True,True
20589,True,True,False,False,False,True,True
20590,True,True,False,False,False,True,True


In [287]:
nodes1 == nodes2

False

In [288]:
edges1 ==  edges2

False

Writing out as a loop:

In [18]:
for index in range(1,11):
    df_final,sample_list = subset_sample(original_df)
    spearman_df = network_construction(df_final)
    nodes,edges = network_test(spearman_df)

    with open ('Data/sample_size_metabolomics/nodes/Run'+str(index)+'.txt', "w") as file:
        file.write(','.join(str(i) for i in nodes))
    with open ('Data/sample_size_metabolomics/edges/Run'+str(index)+'.txt', "w") as file:
        file.write(','.join(str(i) for i in edges))

92
389
90
529
95
639
87
426
91
540
97
669
97
527
100
572
103
653
109
669


In [3]:
path = os.getcwd() + '\\Data\\sample_size_metabolomics\\nodes'

In [4]:
final_list_nodes = []

for filename in os.listdir(path): #also lists directories
    if filename.startswith('Run'):
        file_num = int(filename[3:-4])  #obtain the number of the Run
        if file_num < 11:
            print(filename)
            with open(os.path.join(path, filename)) as file: # open in readonly mode
                lines = file.readlines()
                vals = lines[0].split(',')
                vals =  [x for x in vals]
            final_list_nodes.append(vals)

Run1.txt
Run10.txt
Run2.txt
Run3.txt
Run4.txt
Run5.txt
Run6.txt
Run7.txt
Run8.txt
Run9.txt


In [5]:
final_list_nodes[0]

['R-HSA-112310',
 'R-HSA-192456',
 'R-HSA-196854',
 'R-HSA-1989781',
 'R-HSA-2262752',
 'R-HSA-400206',
 'R-HSA-418594',
 'R-HSA-556833',
 'R-HSA-6806667',
 'R-HSA-74182',
 'R-HSA-77108',
 'R-HSA-77111',
 'R-HSA-8935690',
 'R-HSA-8957322',
 'R-HSA-8978868',
 'R-HSA-112311',
 'R-HSA-1483255',
 'R-HSA-6814848',
 'R-HSA-112315',
 'R-HSA-189200',
 'R-HSA-372790',
 'R-HSA-373076',
 'R-HSA-388396',
 'R-HSA-500792',
 'R-HSA-71387',
 'R-HSA-1368082',
 'R-HSA-1368108',
 'R-HSA-1428517',
 'R-HSA-9717189',
 'R-HSA-1483206',
 'R-HSA-1660661',
 'R-HSA-428157',
 'R-HSA-1483257',
 'R-HSA-156580',
 'R-HSA-211897',
 'R-HSA-211945',
 'R-HSA-156582',
 'R-HSA-420499',
 'R-HSA-9717207',
 'R-HSA-1592230',
 'R-HSA-1614603',
 'R-HSA-2142753',
 'R-HSA-425366',
 'R-HSA-425393',
 'R-HSA-425407',
 'R-HSA-597592',
 'R-HSA-9749641',
 'R-HSA-1614635',
 'R-HSA-425397',
 'R-HSA-73884',
 'R-HSA-73929',
 'R-HSA-1655829',
 'R-HSA-2408508',
 'R-HSA-2408522',
 'R-HSA-433692',
 'R-HSA-5619070',
 'R-HSA-71291',
 'R-HSA-29807

In [6]:
path = os.getcwd() + '\\Data\\sample_size_metabolomics\\edges'

In [7]:
final_list_edges = []

for filename in os.listdir(path): #also lists directories
    if filename.startswith('Run'):
        file_num = int(filename[3:-4])  #obtain the number of the Run
        if file_num < 11:
            print(filename)
            with open(os.path.join(path, filename)) as file: # open in readonly mode
                lines = file.readlines()
                #vals = lines[0].split('),')
                #vals =  [x for x in vals]

                vals = []
                edges = lines[0].split(",")
                for index in range(0,len(edges),2):
                    list1 = edges[index][2:-1],(edges[index+1][2:-2])
                    vals.append(tuple(list1))
                
            final_list_edges.append(vals)

Run1.txt
Run10.txt
Run2.txt
Run3.txt
Run4.txt
Run5.txt
Run6.txt
Run7.txt
Run8.txt
Run9.txt


Get the pathways and edges in common between all:

In [230]:
result = set(final_list_nodes[0])
for list1 in final_list_nodes[1:]:
    result.intersection_update(list1)
    
result = list(result)
print(result)

['R-HSA-77111', 'R-HSA-556833', 'R-HSA-6806667', 'R-HSA-425397', 'R-HSA-74182', 'R-HSA-6782315', 'R-HSA-1592230', 'R-HSA-1368108', 'R-HSA-1655829', 'R-HSA-192456', 'R-HSA-418594', 'R-HSA-5619108', 'R-HSA-1660661', 'R-HSA-400508', 'R-HSA-6782861', 'R-HSA-1989781', 'R-HSA-425393', 'R-HSA-2142753', 'R-HSA-1483206', 'R-HSA-500792', 'R-HSA-196854', 'R-HSA-8978868', 'R-HSA-112310', 'R-HSA-5619115', 'R-HSA-2426168', 'R-HSA-2187338', 'R-HSA-9707616', 'R-HSA-9711123', 'R-HSA-189200', 'R-HSA-2408508', 'R-HSA-73884', 'R-HSA-112315', 'R-HSA-9707564', 'R-HSA-1614603', 'R-HSA-597592', 'R-HSA-72306', 'R-HSA-72312', 'R-HSA-6814848', 'R-HSA-804914', 'R-HSA-196849', 'R-HSA-381340', 'R-HSA-416476', 'R-HSA-5619102', 'R-HSA-8957322', 'R-HSA-1483255', 'R-HSA-388396', 'R-HSA-425407', 'R-HSA-381771', 'R-HSA-73614', 'R-HSA-2151201', 'R-HSA-2980736', 'R-HSA-77108', 'R-HSA-2408522', 'R-HSA-1368082', 'R-HSA-372790', 'R-HSA-373076', 'R-HSA-425366', 'R-HSA-70171', 'R-HSA-5619063', 'R-HSA-444209', 'R-HSA-73929', 'R-

In [231]:
#Source: https://stackoverflow.com/questions/30902558/finding-length-of-the-longest-list-in-an-irregular-list-of-lists

list_len = [len(i) for i in final_list_nodes]
print(max(list_len))
print(min(list_len))

109
87


In [9]:
len(result)

71

In [234]:
result = set(final_list_edges[0])
for list1 in final_list_edges[1:]:
    result.intersection_update(list1)
    
result = list(result)
print(result)

[('R-HSA-8978868', 'R-HSA-112315'), ('R-HSA-2262752', 'R-HSA-77108'), ('R-HSA-400206', 'R-HSA-416476'), ('R-HSA-192456', 'R-HSA-373076'), ('R-HSA-8978868', 'R-HSA-381771'), ('R-HSA-112310', 'R-HSA-196854'), ('R-HSA-71387', 'R-HSA-9711123'), ('R-HSA-1989781', 'R-HSA-74182'), ('R-HSA-77108', 'R-HSA-373076'), ('R-HSA-500792', 'R-HSA-5619115'), ('R-HSA-196854', 'R-HSA-373076'), ('R-HSA-77108', 'R-HSA-9707616'), ('R-HSA-77111', 'R-HSA-1368082'), ('R-HSA-112315', 'R-HSA-373076'), ('R-HSA-556833', 'R-HSA-500792'), ('R-HSA-6806667', 'R-HSA-373076'), ('R-HSA-8978868', 'R-HSA-400508'), ('R-HSA-77108', 'R-HSA-2187338'), ('R-HSA-192456', 'R-HSA-400206'), ('R-HSA-8957322', 'R-HSA-112315'), ('R-HSA-2262752', 'R-HSA-372790'), ('R-HSA-77108', 'R-HSA-1655829'), ('R-HSA-8957322', 'R-HSA-381771'), ('R-HSA-112310', 'R-HSA-6806667'), ('R-HSA-77108', 'R-HSA-8957322'), ('R-HSA-196854', 'R-HSA-8957322'), ('R-HSA-8935690', 'R-HSA-373076'), ('R-HSA-8978868', 'R-HSA-444209'), ('R-HSA-74182', 'R-HSA-1368082'), ('

In [235]:
list_len = [len(i) for i in final_list_edges]
print(max(list_len))
print(min(list_len))

669
389


In [236]:
len(result)

231

Comparing with original naive network:

In [178]:
metabolomic = nx.read_gml("Cytoscape/metabolomic_final_commoncases.gml")

In [179]:
print(metabolomic.number_of_nodes())
print(metabolomic.number_of_edges())

117
724


In [180]:
naive_all = list(metabolomic.nodes())
len(set(naive_all) & set(result))

0

In [181]:
metabolomic.edges() #final

EdgeView([('R-HSA-110331', 'R-HSA-2161522'), ('R-HSA-110331', 'R-HSA-2161541'), ('R-HSA-110331', 'R-HSA-418594'), ('R-HSA-110331', 'R-HSA-5683826'), ('R-HSA-2161522', 'R-HSA-73927'), ('R-HSA-2161541', 'R-HSA-73927'), ('R-HSA-418594', 'R-HSA-112310'), ('R-HSA-418594', 'R-HSA-112315'), ('R-HSA-418594', 'R-HSA-15869'), ('R-HSA-418594', 'R-HSA-1614603'), ('R-HSA-418594', 'R-HSA-189200'), ('R-HSA-418594', 'R-HSA-196849'), ('R-HSA-418594', 'R-HSA-196854'), ('R-HSA-418594', 'R-HSA-425366'), ('R-HSA-418594', 'R-HSA-425393'), ('R-HSA-418594', 'R-HSA-425397'), ('R-HSA-418594', 'R-HSA-425407'), ('R-HSA-418594', 'R-HSA-556833'), ('R-HSA-418594', 'R-HSA-561048'), ('R-HSA-418594', 'R-HSA-5668914'), ('R-HSA-418594', 'R-HSA-597592'), ('R-HSA-418594', 'R-HSA-71291'), ('R-HSA-418594', 'R-HSA-71387'), ('R-HSA-418594', 'R-HSA-73884'), ('R-HSA-418594', 'R-HSA-73927'), ('R-HSA-418594', 'R-HSA-73929'), ('R-HSA-418594', 'R-HSA-74182'), ('R-HSA-418594', 'R-HSA-74217'), ('R-HSA-418594', 'R-HSA-74259'), ('R-HSA-

In [183]:
naive_all = list(metabolomic.edges())
len(set(sorted(naive_all)) & set(result))

231

In [159]:
set(result)

{('R-HSA-112310', 'R-HSA-192456'),
 ('R-HSA-112310', 'R-HSA-196854'),
 ('R-HSA-112310', 'R-HSA-1989781'),
 ('R-HSA-112310', 'R-HSA-2262752'),
 ('R-HSA-112310', 'R-HSA-400206'),
 ('R-HSA-112310', 'R-HSA-556833'),
 ('R-HSA-112310', 'R-HSA-6806667'),
 ('R-HSA-112310', 'R-HSA-74182'),
 ('R-HSA-112310', 'R-HSA-77108'),
 ('R-HSA-112310', 'R-HSA-77111'),
 ('R-HSA-112310', 'R-HSA-8935690'),
 ('R-HSA-112310', 'R-HSA-8957322'),
 ('R-HSA-112310', 'R-HSA-8978868'),
 ('R-HSA-112315', 'R-HSA-372790'),
 ('R-HSA-112315', 'R-HSA-373076'),
 ('R-HSA-112315', 'R-HSA-388396'),
 ('R-HSA-112315', 'R-HSA-500792'),
 ('R-HSA-192456', 'R-HSA-112315'),
 ('R-HSA-192456', 'R-HSA-1483206'),
 ('R-HSA-192456', 'R-HSA-196854'),
 ('R-HSA-192456', 'R-HSA-1989781'),
 ('R-HSA-192456', 'R-HSA-2262752'),
 ('R-HSA-192456', 'R-HSA-2980736'),
 ('R-HSA-192456', 'R-HSA-373076'),
 ('R-HSA-192456', 'R-HSA-381771'),
 ('R-HSA-192456', 'R-HSA-400206'),
 ('R-HSA-192456', 'R-HSA-400508'),
 ('R-HSA-192456', 'R-HSA-444209'),
 ('R-HSA-1924