### Comparing the difference in edges with other networks

### Comparing condition-specific networks (mild vs severe) with the full correlation results and then constructing the naive difference network:

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import sspa

In [None]:
mild_network = nx.read_gml("../Cytoscape/metabolomic_final_mildcases.gml")
mild_nodes = list(mild_network.nodes())

severe_network = nx.read_gml("../Cytoscape/metabolomic_final_severecases.gml")
severe_nodes = list(severe_network.nodes())

G = nx.read_gml("../Cytoscape/metabolomic_final_commoncases.gml")
all_edges = list(G.edges())
all_nodes = list(G.nodes())
len(all_edges)

In [None]:
mild_network = nx.read_gml("../Cytoscape/proteomic_final_mildcases.gml")
mild_nodes = list(mild_network.nodes())

severe_network = nx.read_gml("../Cytoscape/proteomic_final_severecases.gml")
severe_nodes = list(severe_network.nodes())

G = nx.read_gml("../Cytoscape/proteomic_final_commoncases.gml")
all_edges = list(G.edges())
all_nodes = list(G.nodes())
len(all_edges)

In [2]:
mild_network = nx.read_gml("../Cytoscape/integrated_final_mildcases.gml")
mild_nodes = list(mild_network.nodes())

severe_network = nx.read_gml("../Cytoscape/integrated_final_severecases.gml")
severe_nodes = list(severe_network.nodes())

G = nx.read_gml("../Cytoscape/integrated_final_commoncases.gml")
all_edges = list(G.edges())
all_nodes = list(G.nodes())
len(all_edges)

90096

In [3]:
#Confirm that the mild and severe nodes DO NOT have new nodes that the full network doesn't
print(len(all_nodes))
print(len(mild_nodes))
print(len(severe_nodes))
print(len(list(set(all_nodes).intersection(set(mild_nodes)))))
print(len(list(set(all_nodes).intersection(set(severe_nodes)))))

#Metabolomic severe network has 3 new nodes  #{'R-HSA-211859', 'R-HSA-5663205', 'R-HSA-9734207'}
#Proteomic same nodes for all
#Integrated mild network has 4 new nodes and severe network has 3 new nodes  #{'R-HSA-192105', 'R-HSA-193368', 'R-HSA-211976', 'R-HSA-5619084'} {'R-HSA-174824', 'R-HSA-83936', 'R-HSA-8956321'} 

666
424
589
420
586


In [5]:
#Find nodes which are present in mild/severe network not present in full network
set((mild_nodes)).difference(set(all_nodes).intersection(set(mild_nodes)))
set((severe_nodes)).difference(set(all_nodes).intersection(set(severe_nodes)))

{'R-HSA-174824', 'R-HSA-83936', 'R-HSA-8956321'}

In [6]:
mild_edges = list(mild_network.edges())
print(len(mild_edges))
severe_edges = list(severe_network.edges())
print(len(severe_edges))

6421
78106


In [None]:
#Example code from https://stackoverflow.com/questions/41125909/python-find-elements-in-one-list-that-are-not-in-the-other
list_1=["a", "b", "c", "d", "e"]
list_2=["a", "f", "c", "m"]
set(list_2) - set(list_1)

#set(['m', 'f'])

In [7]:
#Edges present in the mild network but not the severe network
print(len(list(set(mild_edges) - set(severe_edges))))
mild_naive = list(set(mild_edges) - set(severe_edges))

2973


In [8]:
#Edges present in the severe network but not the mild network
print(len(list(set(severe_edges) - set(mild_edges))))
severe_naive = list(set(severe_edges) - set(mild_edges))

74658


In [9]:
#Edges present in both mild and severe
print(len(list(set(severe_edges) & set(mild_edges))))
mild_severe_naive = list(set(severe_edges) & set(mild_edges))

3448


In [10]:
print(len(list(set(mild_naive) - set(all_edges)))) #Edges in the mild network (but not severe network) not in the full network

print(len(list(set(severe_naive) - set(all_edges)))) #Edges in the severe network (but not mild network) not in the full network

print(len(list(set(mild_severe_naive) - set(all_edges)))) #Edges in the mild network AND severe network not in the full network

2330
8918
354


In [None]:
mild_dict = {mild_naive[i]:"Mild" for i in range(0,len(mild))}
severe_dict = {severe_naive[i]:"Severe" for i in range(0,len(severe))}

condition_dict = mild_dict.copy()
for key, value in severe_dict.items():
    condition_dict[key] = value

print(condition_dict)

In [None]:
G=nx.from_edgelist(mild_naive+severe_naive)
len(G.edges())
nx.set_edge_attributes(G, condition_dict, "Condition")

In [None]:
#IF YOU DON'T REMOVE THE EDGES, DON'T RUN THIS CODE

#Add the betweenness centrality as a node attribute
betweenness= nx.betweenness_centrality(G, normalized=True) #output as dictionary
#display(betweenness)
nx.set_node_attributes(G, betweenness, "betweenness")

#Add the betweenness centrality as a node attribute
degree= nx.degree_centrality(G) #output as dictionary
#display(degree)
nx.set_node_attributes(G, degree, "degrees")


#Check
print(G.number_of_nodes()) 
#G.nodes['R-HSA-110331']#["betweenness"]

In [None]:
#Using Cecilia's code

#Creating graph
hierarchy = pd.read_csv('../Data/ReactomePathwaysRelation.txt', sep='\t', header=None)

#From the pathways, subset to Homo sapiens only
hierarchy_hsa = hierarchy[hierarchy[0].str.contains('HSA')]

#Return unique values in the first column that is not in the second column as a numpy array
#These values are not child pathways in any instances
hierarchy_hsa_parents = np.setdiff1d(hierarchy_hsa[0], hierarchy_hsa[1])

#Add the unique values not in the second column as a second attached dataset to the bottom of the original data
#The first column represents the parent column, the second column is the child column
hierarchy_hsa_all = pd.concat([hierarchy_hsa, pd.DataFrame([hierarchy_hsa_parents, hierarchy_hsa_parents], index=[0, 1]).T])

#DiGraph is a directed graph
H = nx.from_pandas_edgelist(hierarchy_hsa, source=0, target=1, create_using=nx.DiGraph())


In [None]:
#Convert pathway ID to name
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}

#Using Cecilia's code

#Find the root pathway

def find_root(H,child):
    #Find parent from child 
    parent = list(H.predecessors(child))

    #Keep the loop going until the highest level is reached
    if len(parent) == 0:
        return child
    else:  
        return find_root(H, parent[0])

hierarchy_hsa_all['Root'] = [find_root(H, i) for i in hierarchy_hsa_all[1]]

hierarchy_hsa_all.columns = ['Parent', 'Child', 'Root']

#There are 83 instances of duplicates, however all the child duplicates have the same root (even though different parents) after checking

root_pathways = {}
for pathway in list(H.nodes):
    index = hierarchy_hsa_all.Child[hierarchy_hsa_all.Child == pathway].index.tolist()[0]
    root_pathway  = hierarchy_hsa_all.Root[index]
    label = root_pathway_dict[root_pathway]
    root_pathways[pathway] = label


In [None]:
#Using Cecilia's code
 
#Shows all the root pathways in Reactome
set(hierarchy_hsa_all['Root'] )
#Shows all the root pathways present in the original dataset
set(root_pathways.values())

nx.set_node_attributes(G, root_pathways, "root_pathway")

print(G.number_of_nodes()) 

In [None]:
#Clustering with Louvain algorithm

#For some reason, not sure if really clustering by weight since it works with a typo 
#I think it works though, since changing the name gives a diff num of clusters even with seed set
#Resolution = 1 is the default, increasing resolution will yield more communities
louvain_clusters = nx.community.louvain_communities(G, weight='Squared_corr',seed=100,resolution=1.2)
print(len(louvain_clusters))

louvain_dict = {}
for index,grouping in enumerate(louvain_clusters):
    for pathway in grouping:
        louvain_dict[pathway] = index+1

nx.set_node_attributes(G, louvain_dict, "louvain")

In [None]:
#Assign pathway name to node

#Metabolomic:
#reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt") 

#Proteomic:
reactome_pathways = sspa.process_reactome('Homo sapiens', infile = '../Data/UniProt2Reactome_All_Levels_ver84.txt', download_latest = False, filepath = None)

#Integrated:
#reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str")


In [None]:

pathway_name_dict = {reactome_pathways.index[i]:reactome_pathways["Pathway_name"][i] for i in range(0,len(reactome_pathways))}
#pathway_name_dict['R-HSA-1483257']
pathway_name_dict = {k:pathway_name_dict[k] for k in list(G.nodes)}

nx.set_node_attributes(G, pathway_name_dict, "pathway_name")

In [None]:
#nx.write_gml(G,'../Cytoscape/metabolomic_condition_diff.gml')
#nx.write_gml(G,'../Cytoscape/proteomic_condition_diff.gml')
#nx.write_gml(G,'../Cytoscape/integrated_condition_diff.gml')

### Making a graph for the differential networks

In [11]:
import networkx as nx

In [12]:
#Reading in the edges expressed in the differential network
#with open('../Data/permutation_test_files_metabolomics/sig_edges.txt') as f:
#with open('../Data/permutation_test_files_proteomics/sig_edges.txt') as f:
with open('../Data/permutation_test_files_integrated/sig_edges.txt') as f:
    lines = f.readlines()

edges_remaining = []

edges = lines[0].split(",")

for index in range(0,len(edges),2):
    list1 = edges[index],(edges[index+1][1:]) #becomes a tuple
    edges_remaining.append(list1)

print(len(edges_remaining))
#edges_remaining

2432


In [13]:
#Reading in the edges expressed in the differential network
#with open('../Data/permutation_test_files_metabolomics/sigedge_direction.txt') as f:
#with open('../Data/permutation_test_files_proteomics/sigedge_direction.txt') as f:
with open('../Data/permutation_test_files_integrated/sigedge_direction.txt') as f:
    lines = f.readlines()
    sigedge_direction = lines[0].split(",")

print(len(sigedge_direction))

2432


In [14]:
condition_dict = {edges_remaining[i]:sigedge_direction[i] for i in range(0,len(sigedge_direction))}
print(condition_dict)

{('R-HSA-1168372', 'R-HSA-114508'): 'severe', ('R-HSA-1169408', 'R-HSA-109581'): 'severe', ('R-HSA-1169410', 'R-HSA-109581'): 'severe', ('R-HSA-1227986', 'R-HSA-112314'): 'severe', ('R-HSA-1227986', 'R-HSA-114508'): 'severe', ('R-HSA-1234174', 'R-HSA-112314'): 'severe', ('R-HSA-1234176', 'R-HSA-109704'): 'severe', ('R-HSA-1234176', 'R-HSA-112399'): 'severe', ('R-HSA-1236974', 'R-HSA-114508'): 'severe', ('R-HSA-1280215', 'R-HSA-1234176'): 'severe', ('R-HSA-1280215', 'R-HSA-1257604'): 'severe', ('R-HSA-1428517', 'R-HSA-112314'): 'severe', ('R-HSA-1428517', 'R-HSA-114508'): 'severe', ('R-HSA-1475029', 'R-HSA-1234176'): 'severe', ('R-HSA-1475029', 'R-HSA-1433559'): 'severe', ('R-HSA-156580', 'R-HSA-110362'): 'severe', ('R-HSA-156580', 'R-HSA-110373'): 'severe', ('R-HSA-165159', 'R-HSA-1280215'): 'severe', ('R-HSA-165159', 'R-HSA-1475029'): 'severe', ('R-HSA-1655829', 'R-HSA-1475029'): 'severe', ('R-HSA-166016', 'R-HSA-114508'): 'severe', ('R-HSA-166016', 'R-HSA-1226099'): 'severe', ('R-HSA

In [15]:
G=nx.from_edgelist(edges_remaining)
print(len(G.edges()))
nx.set_edge_attributes(G, condition_dict, "Condition")

2432


In [None]:
#For intersectional network only
G=nx.from_edgelist(edges_for_intersection_network)
print(len(G.edges()))
nx.set_edge_attributes(G, condition_dict, "Condition")

Run the code above that I used to construct the differential network for the condition-specific networks to add the node attributes, before running the cells underneath:

In [None]:
#nx.write_gml(G,'../Cytoscape/metabolomic_differential.gml')
#nx.write_gml(G,'../Cytoscape/proteomic_differential.gml')
#nx.write_gml(G,'../Cytoscape/integrated_differential.gml')

#No overlap between metabolomic and naive differential network,so don't need to re-do
#nx.write_gml(G,'../Cytoscape/proteomic_differential_intersect.gml') #142 nodes, 185 edges
#nx.write_gml(G,'../Cytoscape/integrated_differential_intersect.gml') #122 nodes,

In [16]:
G.number_of_nodes()

491

In [17]:
#Check if there are nodes in the differential network not in full correlation network
diff_nodes = list(G.nodes())

In [18]:
#H = nx.read_gml("../Cytoscape/metabolomic_final_commoncases.gml")
#H = nx.read_gml("../Cytoscape/proteomic_final_commoncases.gml")
H = nx.read_gml("../Cytoscape/integrated_final_commoncases.gml")
all_nodes = list(H.nodes())

In [19]:
#Confirm that the mild and severe nodes DO NOT have new nodes that the full network doesn't
print(len(all_nodes))
print(len(diff_nodes))
print(len(list(set(all_nodes).intersection(set(diff_nodes)))))

#Metabolomic differential network has 4 new nodes  
#Proteomic differential network has 1 new node 
#Integrated differential network has 11 new nodes

#Proteomic differential intersectional network has 0 new nodes  
#Integrated differential intersectional network has 1 new node

666
491
480


In [20]:
#Find nodes which are present in mild/severe network not present in full network
list(set((diff_nodes)).difference(set(all_nodes).intersection(set(diff_nodes))))

['R-HSA-381753',
 'R-HSA-193368',
 'R-HSA-83936',
 'R-HSA-5678520',
 'R-HSA-8956321',
 'R-HSA-422085',
 'R-HSA-211945',
 'R-HSA-211981',
 'R-HSA-192105',
 'R-HSA-211897',
 'R-HSA-964975']

### Comparing edges in the condition-specific networks to the differential networks:

*Must load in the code from previous section

Even filtering at alpha == 0 for the differential network, for the proteomic and integrated network this still leads to too many edges, so then I take the intersection of those edges with the ones from the condition-specific network.

In [21]:
mild_diff_edges = []
severe_diff_edges = []

for key,value in condition_dict.items():
    if value == "mild":
        mild_diff_edges.append(key)
    if value == "severe":
        severe_diff_edges.append(key)

print(len(mild_diff_edges))
print(len(severe_diff_edges))

113
2319


In [22]:
intersection1 = list(set(mild_diff_edges).intersection(list(set(mild_naive))))  
len(intersection1)

28

In [23]:
intersection2 = list(set(severe_diff_edges).intersection(list(set(severe_naive)))) 
len(intersection2) 

131

In [None]:
#FOR INTERSECTION NETWORK, remake condition dictionary since we are only interested in p-values of zero

edges_for_intersection_network = intersection1 + intersection2

mild_dict = {intersection1[i]:"Mild" for i in range(0,len(intersection1))}
severe_dict = {intersection2[i]:"Severe" for i in range(0,len(intersection2))}

condition_dict = mild_dict.copy()
for key, value in severe_dict.items():
    condition_dict[key] = value

print(condition_dict)

In [24]:
intersection = list(set(severe_diff_edges).intersection(list(set(mild_naive))))  
len(intersection)

0

In [25]:
intersection = list(set(mild_diff_edges).intersection(list(set(severe_naive))))  
len(intersection)

0

In [26]:
intersection = list(set(edges_remaining).intersection(list(set(mild_severe_naive)))) 
len(intersection) 

0

In [27]:
intersection = list(set(edges_remaining).intersection(list(set(all_edges))))   #intersection of all differential edges with all edges in the full correlation network
len(intersection) 

69

### Comparing with the significant edges BETWEEN the differential networks for all omics:

In [29]:
def edge_num (omics):
    with open('../Data/permutation_test_files_'+omics+'/sig_edges.txt') as f:
        lines = f.readlines()
    edges_remaining = []
    edges = lines[0].split(",")

    for index in range(0,len(edges),2):
        list1 = edges[index],(edges[index+1][1:]) #becomes a tuple
        edges_remaining.append(list1)

    return edges_remaining


In [30]:
proteomic_edges = edge_num('proteomics')
integrated_edges = edge_num('integrated')
metabolomic_edges = edge_num('metabolomics')

In [36]:
len(proteomic_edges)

1592

In [37]:
intersection = list(set(metabolomic_edges).intersection(list(set(proteomic_edges))))  
len(intersection) 

0

In [32]:
intersection = list(set(integrated_edges).intersection(list(set(metabolomic_edges))))  
len(intersection) 

2

In [33]:
intersection = list(set(integrated_edges).intersection(list(set(proteomic_edges))))  
len(intersection) 

993

In [34]:
intersection = (set(metabolomic_edges) & set(proteomic_edges) & set(integrated_edges))  
len(intersection) 

0

In [35]:
len(list(set(integrated_edges) - (set(metabolomic_edges)|set(proteomic_edges)))) #Pathway pairs for integrated data not detected by either single omics

1437

### Get number of common pathways between the three omics

In [46]:
import pandas as pd
import sspa

In [47]:
metabolomic_df = pd.read_csv('../Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
proteomic_df = pd.read_csv('../Data/Su_COVID_proteomics_processed_commoncases.csv', index_col=0)
integrated_df = pd.read_csv("../Data/Su_integrated_data.csv", index_col=0)

#Remove root pathways
#Convert pathway ID to name
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}
root_pathway_names = list(root_pathway_dict.keys())

In [48]:
reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")
kpca_scores = sspa.sspa_kpca(metabolomic_df.iloc[:,:-2], reactome_pathways)

#Using Sara's code
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

metabolomic_pathways = kpca_scores.columns
metabolomic_pathways

Index(['R-HSA-110331', 'R-HSA-112310', 'R-HSA-112311', 'R-HSA-112315',
       'R-HSA-1237112', 'R-HSA-1368082', 'R-HSA-1368108', 'R-HSA-1428517',
       'R-HSA-1483148', 'R-HSA-1483206',
       ...
       'R-HSA-9707564', 'R-HSA-9707616', 'R-HSA-9711123', 'R-HSA-9717189',
       'R-HSA-9717207', 'R-HSA-9734207', 'R-HSA-9735804', 'R-HSA-9749641',
       'R-HSA-9753281', 'R-HSA-975634'],
      dtype='object', length=144)

In [49]:
#Load in a file downloaded from https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt
reactome_pathways = sspa.process_reactome('Homo sapiens', infile = '../Data/UniProt2Reactome_All_Levels_ver84.txt', download_latest = False, filepath = None)
kpca_scores = sspa.sspa_kpca(proteomic_df.iloc[:,:-2], reactome_pathways)

#Using Sara's code
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

proteomic_pathways = kpca_scores.columns
proteomic_pathways

Index(['R-HSA-109581', 'R-HSA-109606', 'R-HSA-109704', 'R-HSA-110362',
       'R-HSA-110373', 'R-HSA-111465', 'R-HSA-112314', 'R-HSA-112315',
       'R-HSA-112399', 'R-HSA-114508',
       ...
       'R-HSA-9772573', 'R-HSA-9818027', 'R-HSA-9824439', 'R-HSA-9824443',
       'R-HSA-9824446', 'R-HSA-983168', 'R-HSA-983169', 'R-HSA-983231',
       'R-HSA-983695', 'R-HSA-983705'],
      dtype='object', length=578)

In [50]:
reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str")
kpca_scores = sspa.sspa_kpca(integrated_df.iloc[:,:-2], reactome_pathways)

#Using Sara's code
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

integrated_pathways = kpca_scores.columns
integrated_pathways

Index(['R-HSA-109581', 'R-HSA-109606', 'R-HSA-109704', 'R-HSA-110331',
       'R-HSA-110362', 'R-HSA-110373', 'R-HSA-111465', 'R-HSA-112310',
       'R-HSA-112311', 'R-HSA-112314',
       ...
       'R-HSA-9818027', 'R-HSA-9824439', 'R-HSA-9824443', 'R-HSA-9824446',
       'R-HSA-983168', 'R-HSA-983169', 'R-HSA-983231', 'R-HSA-983695',
       'R-HSA-983705', 'R-HSA-983712'],
      dtype='object', length=710)

In [51]:
intersection = list(set(metabolomic_pathways).intersection(list(set(proteomic_pathways))))   #47 pathways
intersection = (set(metabolomic_pathways) & set(proteomic_pathways) & set(integrated_pathways))  #47 pathways 

intersection = list(set(metabolomic_pathways).intersection(list(set(integrated_pathways))))  #144 pathways (same as number of metabolomic)
intersection = list(set(proteomic_pathways).intersection(list(set(integrated_pathways))))  #578 pathways (same as number of proteomic)

#710 pathways in integrated dataset
#((144+578)-47+35)

len(list(set(integrated_pathways) - (set(metabolomic_pathways)|set(proteomic_pathways)))) #35 pathways for integrated data not detected by either single omics

35

In [52]:
new_pathways = list(set(integrated_pathways) - (set(metabolomic_pathways)|set(proteomic_pathways)))
new_pathways

['R-HSA-379401',
 'R-HSA-71240',
 'R-HSA-193993',
 'R-HSA-975576',
 'R-HSA-379397',
 'R-HSA-209968',
 'R-HSA-373760',
 'R-HSA-202131',
 'R-HSA-2162123',
 'R-HSA-5579029',
 'R-HSA-9660821',
 'R-HSA-8939902',
 'R-HSA-187015',
 'R-HSA-379398',
 'R-HSA-5652084',
 'R-HSA-9694631',
 'R-HSA-203615',
 'R-HSA-9734735',
 'R-HSA-8868773',
 'R-HSA-983712',
 'R-HSA-70350',
 'R-HSA-9755088',
 'R-HSA-1483166',
 'R-HSA-901042',
 'R-HSA-9634815',
 'R-HSA-532668',
 'R-HSA-2672351',
 'R-HSA-196807',
 'R-HSA-8978934',
 'R-HSA-194002',
 'R-HSA-389661',
 'R-HSA-975578',
 'R-HSA-73942',
 'R-HSA-1483249',
 'R-HSA-193681']

In [74]:
#Get number of pathways that have been influenced

df = pd.read_csv("../Data/Su_integrated_data.csv", index_col=0)
reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str")


kpca_scores = sspa.sspa_kpca(integrated_df.iloc[:,:-2], reactome_pathways)

#Using Sara's code
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

#Obtain pathways and corresponding metabolites for all Reactome pathways, store as dictionary
orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

#Filter out dictionary to retain only the pathways that remain after kPCA
my_keys = kpca_scores.columns
pathways_dict = {key: orig_dict[key] for key in my_keys}

#Filter out the compounds in the pathways that are not present in the dataset

#Obtain all unique values in dataset
compounds_present = list(df.columns[:-2])
filtered_dict = {} 

#My code adapted from Cecilia's
#If the key values are not part of the compounds in dataset then remove
for key,value in pathways_dict.items():
    new_val = [item for item in value if item in compounds_present]
    if len(new_val) >= 2: #at least two compounds in the pathway
        filtered_dict[key] = new_val

In [None]:
new_pathways

In [75]:
#All 35 new pathways comprised of one metabolite and one protein

proteomic_only_pathway = []
proteomic_pathway_with_onemetabolite = []
metabolomic_only_pathway = []
metabolomic_pathway_with_oneprotein = []
both_omic = []
new_pathway = []

for key,values in filtered_dict.items():
    #if key in new_pathways:
        #print(values)
    #print(values)
    proteomic=False
    metabolomic=False
    both=False


    proteins = 0
    metabolites = 0

    for value in values:

        if value[0].isalpha():
            proteins += 1
        else:
            metabolites += 1

    
    if metabolites == 0:
        proteomic_only_pathway.append(key)
    if proteins > 1 and metabolites == 1:
        proteomic_pathway_with_onemetabolite.append(key)
    
    if proteins == 0:
        metabolomic_only_pathway.append(key)
    if metabolites > 1 and proteins == 1:
        metabolomic_pathway_with_oneprotein.append(key)

    if metabolites > 1 and proteins > 1:
        both_omic.append(key)
    if proteins == 1 and metabolites == 1:
        new_pathway.append(key)



In [76]:
print(len(proteomic_only_pathway))
print(len(proteomic_pathway_with_onemetabolite))
print(len(metabolomic_only_pathway))
print(len(metabolomic_pathway_with_oneprotein))
print(len(both_omic))
print(len(new_pathway))

485
46
65
32
47
35


### Comparing the initial test statistics of the three omics to see which edges were affected

In [28]:
import pandas as pd

Comparing the initial test statistics of the integrated data with the proteomic initial test statistics:

In [38]:
#Load dataset
metabolomic = pd.read_csv('../Data/permutation_test_files_metabolomics/initial_tstats.csv', index_col=0)
proteomic = pd.read_csv('../Data/permutation_test_files_proteomics/initial_tstats.csv', index_col=0)
integrated = pd.read_csv('../Data/permutation_test_files_integrated/initial_tstats.csv', index_col=0)

In [39]:
metabolomic = metabolomic.rename(columns={'Initial_tstat': 'metabolomic_tstat'})
proteomic = proteomic.rename(columns={'Initial_tstat': 'proteomic_tstat'})
integrated = integrated.rename(columns={'Initial_tstat': 'integrated_tstat'})

In [40]:
initial_tstat_comp = proteomic.join(integrated)
initial_tstat_comp

Unnamed: 0_level_0,proteomic_tstat,integrated_tstat
Edges,Unnamed: 1_level_1,Unnamed: 2_level_1
"R-HSA-109606, R-HSA-109581",-0.267043,-0.267043
"R-HSA-109704, R-HSA-109581",-0.423681,-0.423681
"R-HSA-109704, R-HSA-109606",-0.332380,-0.332380
"R-HSA-110362, R-HSA-109581",-0.090820,-0.090820
"R-HSA-110362, R-HSA-109606",-0.031440,-0.031440
...,...,...
"R-HSA-983705, R-HSA-9824446",0.048487,0.041855
"R-HSA-983705, R-HSA-983168",-0.231214,-0.231214
"R-HSA-983705, R-HSA-983169",-0.029522,-0.029522
"R-HSA-983705, R-HSA-983231",-0.246918,-0.246918


In [41]:
#Number of pathway pairs with initial test statistic that are the same

proteomic_only_edge = []
metabolomic_influence_edge = []
counter = 0

for edge in initial_tstat_comp.index:
    if initial_tstat_comp.loc[edge].proteomic_tstat == initial_tstat_comp.loc[edge].integrated_tstat:
        proteomic_only_edge.append(edge)
        counter += 1
    else:
        metabolomic_influence_edge.append(edge)


print(counter)

#166,753 edges of the proteomic initial test statistic dataset are all included in the integrated initial test statistic data
#117,381 edges out of 166,753 are identical
#Therefore 49,372 edges were influenced by the metabolomic data

len(metabolomic_influence_edge)

117381


49372

In [42]:
#Change to tuple format so you can compare with the significant edges

metabolomic_influence_edge_tuple = []
for i in range(len(metabolomic_influence_edge)):
    edge = metabolomic_influence_edge[i].split(",")
    edge = (edge[0],edge[1][1:])
    #edge = tuple(sorted(edge))
    metabolomic_influence_edge_tuple.append(edge)


proteomic_only_edge_tuple  = []
for i in range(len(proteomic_only_edge )):
    edge = proteomic_only_edge [i].split(",")
    edge = (edge[0],edge[1][1:])
    #edge = tuple(sorted(edge))
    proteomic_only_edge_tuple.append(edge)

In [43]:
integrated_edge = integrated.index

integrated_edge_tuple  = []
for i in range(len(integrated_edge)):
    edge = integrated_edge [i].split(",")
    edge = (edge[0],edge[1][1:])
    #edge = tuple(sorted(edge))
    integrated_edge_tuple.append(edge)



In [64]:
#Need to see proportion of metabolomic-influenced edges that are significantly differentially expressed edges (from differential network) in the proteomic data vs the integrated data
print(len(metabolomic_influence_edge))
print(len(list(set(proteomic_edges).intersection(set(metabolomic_influence_edge_tuple))))) #680 edges significant
print(len(list(set(integrated_edges).intersection(set(metabolomic_influence_edge_tuple))))) #After integrating with the metabolomic data, 765 significant


#290 edges are the same between 680 edges and 765 edges
list1 = list(set(proteomic_edges).intersection(set(metabolomic_influence_edge_tuple)))
list2 = list(set(integrated_edges).intersection(set(metabolomic_influence_edge_tuple)))
len(list(set(list1).intersection(set(list2))))

49372
680
765


290

In [65]:
#Showing that 'Metabolomic influenced' proteomic pathways that are significant in the differential network are instances where at least one pathway is a common pathway OR one pathway is influenced by one metabolite (not enough to be considered a pathway in the metabolomic set)

list1 = list(set(proteomic_edges).intersection(set(metabolomic_influence_edge_tuple)))

common_pathways = list(set(proteomic_pathways) & set(metabolomic_pathways))
print(len(common_pathways))
counter = 0 

influenced_noncommon_pathway_pair = []
for edges in list1:
    pathway_pair = list(edges)
    if (pathway_pair[0] in common_pathways) or (pathway_pair[1] in common_pathways):
        #print(pathway_pair)
        counter += 1
    else:
        influenced_noncommon_pathway_pair.append(edges)


print(counter)
#1371 pathway pairs have one common pathway in them, out of the differentially expressed protein pathways in the network that are influenced by the metabolomics dataset
#The other cases are where there is one metabolite added to one of the pathways that is not enough to be constituted as a pathway in the metabolomics dataset

47
327


In [69]:
#We can look at the pathways before and after integration to prove that for the non-common pathways that are influenced by the other omics, there is only one analyte from the other dataset added to it


#df = pd.read_csv('../Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
#reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

df = pd.read_csv("../Data/Su_integrated_data.csv", index_col=0)
reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str")


kpca_scores = sspa.sspa_kpca(integrated_df.iloc[:,:-2], reactome_pathways)

#Using Sara's code
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))

#Obtain pathways and corresponding metabolites for all Reactome pathways, store as dictionary
orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

#Filter out dictionary to retain only the pathways that remain after kPCA
my_keys = kpca_scores.columns
pathways_dict = {key: orig_dict[key] for key in my_keys}

#Filter out the compounds in the pathways that are not present in the dataset

#Obtain all unique values in dataset
compounds_present = list(df.columns[:-2])
filtered_dict = {} 

#My code adapted from Cecilia's
#If the key values are not part of the compounds in dataset then remove
for key,value in pathways_dict.items():
    new_val = [item for item in value if item in compounds_present]
    if len(new_val) >= 2: #at least two compounds in the pathway
        filtered_dict[key] = new_val

In [63]:
print(influenced_noncommon_pathway_pair)
list(reactome_pathways.loc['R-HSA-422475'])  #All proteins, only one metabolite
filtered_dict['R-HSA-422475']

[('R-HSA-9694516', 'R-HSA-1236974'), ('R-HSA-73857', 'R-HSA-1234176'), ('R-HSA-975110', 'R-HSA-1280215'), ('R-HSA-912631', 'R-HSA-212436'), ('R-HSA-9694635', 'R-HSA-70268'), ('R-HSA-8943724', 'R-HSA-73857'), ('R-HSA-73857', 'R-HSA-5685939'), ('R-HSA-199977', 'R-HSA-114508'), ('R-HSA-450302', 'R-HSA-212436'), ('R-HSA-983168', 'R-HSA-9664433'), ('R-HSA-5668541', 'R-HSA-114508'), ('R-HSA-3371497', 'R-HSA-2555396'), ('R-HSA-9662851', 'R-HSA-5654699'), ('R-HSA-2029482', 'R-HSA-1280215'), ('R-HSA-73857', 'R-HSA-70268'), ('R-HSA-9680350', 'R-HSA-3000171'), ('R-HSA-1280215', 'R-HSA-1169410'), ('R-HSA-9694635', 'R-HSA-194315'), ('R-HSA-2990846', 'R-HSA-112314'), ('R-HSA-73857', 'R-HSA-5693538'), ('R-HSA-9664433', 'R-HSA-1839126'), ('R-HSA-3000178', 'R-HSA-209776'), ('R-HSA-1280215', 'R-HSA-109606'), ('R-HSA-9694516', 'R-HSA-5693532'), ('R-HSA-5633007', 'R-HSA-1280215'), ('R-HSA-9664433', 'R-HSA-5654221'), ('R-HSA-975110', 'R-HSA-114508'), ('R-HSA-1236974', 'R-HSA-114508'), ('R-HSA-9664433', 'R-

['Q02246',
 'P48061',
 'P25786',
 'P12931',
 'Q5T4W7',
 '15756',
 'Q8N8S7',
 'Q04759',
 'P07947',
 'Q04637',
 'P39905',
 'Q9HCM2',
 'P07332',
 'P20936',
 'Q99748']

Comparing the initial test statistics of the integrated data with the metabolomic initial test statistics:

In [70]:
initial_tstat_comp = metabolomic.join(integrated)
initial_tstat_comp

Unnamed: 0_level_0,metabolomic_tstat,integrated_tstat
Edges,Unnamed: 1_level_1,Unnamed: 2_level_1
"R-HSA-112310, R-HSA-110331",0.001773,0.001773
"R-HSA-112311, R-HSA-110331",-0.038910,0.011261
"R-HSA-112311, R-HSA-112310",-0.144006,-0.012691
"R-HSA-112315, R-HSA-110331",0.014812,0.018150
"R-HSA-112315, R-HSA-112310",-0.027926,0.728299
...,...,...
"R-HSA-975634, R-HSA-9717207",0.067177,0.108570
"R-HSA-975634, R-HSA-9734207",0.023737,-0.048013
"R-HSA-975634, R-HSA-9735804",0.002574,-0.069612
"R-HSA-975634, R-HSA-9749641",0.015996,-0.000264


In [71]:
#Number of pathway pairs with initial test statistic that are the same
metabolomic_only_edge = []
proteomic_influence_edge = []

counter = 0
for edge in initial_tstat_comp.index:
    if initial_tstat_comp.loc[edge].metabolomic_tstat == initial_tstat_comp.loc[edge].integrated_tstat:
        metabolomic_only_edge.append(edge)
        counter += 1
    else:
        proteomic_influence_edge.append(edge)

print(counter)

#10,296 edges of the metabolomic initial test statistic dataset are all included in the integrated initial test statistic data
#2,088 edges out of 10,296 are identical
#Therefore 8,208 edges were influenced by the proteomic data

proteomic_influence_edge

2088


['R-HSA-112311, R-HSA-110331',
 'R-HSA-112311, R-HSA-112310',
 'R-HSA-112315, R-HSA-110331',
 'R-HSA-112315, R-HSA-112310',
 'R-HSA-112315, R-HSA-112311',
 'R-HSA-1237112, R-HSA-112311',
 'R-HSA-1237112, R-HSA-112315',
 'R-HSA-1368082, R-HSA-112311',
 'R-HSA-1368082, R-HSA-112315',
 'R-HSA-1368108, R-HSA-112311',
 'R-HSA-1368108, R-HSA-112315',
 'R-HSA-1428517, R-HSA-110331',
 'R-HSA-1428517, R-HSA-112310',
 'R-HSA-1428517, R-HSA-112311',
 'R-HSA-1428517, R-HSA-112315',
 'R-HSA-1428517, R-HSA-1237112',
 'R-HSA-1428517, R-HSA-1368082',
 'R-HSA-1428517, R-HSA-1368108',
 'R-HSA-1483148, R-HSA-112311',
 'R-HSA-1483148, R-HSA-112315',
 'R-HSA-1483148, R-HSA-1428517',
 'R-HSA-1483206, R-HSA-110331',
 'R-HSA-1483206, R-HSA-112310',
 'R-HSA-1483206, R-HSA-112311',
 'R-HSA-1483206, R-HSA-112315',
 'R-HSA-1483206, R-HSA-1237112',
 'R-HSA-1483206, R-HSA-1368082',
 'R-HSA-1483206, R-HSA-1368108',
 'R-HSA-1483206, R-HSA-1428517',
 'R-HSA-1483206, R-HSA-1483148',
 'R-HSA-1483255, R-HSA-110331',
 'R-

In [72]:
proteomic_influence_edge_tuple = []
for i in range(len(proteomic_influence_edge)):
    edge = proteomic_influence_edge[i].split(",")
    edge = (edge[0],edge[1][1:])
    #edge = tuple(sorted(edge))
    proteomic_influence_edge_tuple.append(edge)


metabolomic_only_edge_tuple = []
for i in range(len(metabolomic_only_edge)):
    edge = metabolomic_only_edge[i].split(",")
    edge = (edge[0],edge[1][1:])
    #edge = tuple(sorted(edge))
    metabolomic_only_edge_tuple.append(edge)

In [73]:
#Need to see proportion of proteomic-influenced edges that are significant edges in the metabolomic data vs the integrated data
print(len(proteomic_influence_edge))
print(len(list(set(metabolomic_edges).intersection(set(proteomic_influence_edge_tuple))))) #17 edges significant
print(len(list(set(integrated_edges).intersection(set(proteomic_influence_edge_tuple))))) #After integrating with the proteomic data, 45 significant

#2 edges are the same between 17 edges and 45 edges
list1 = list(set(metabolomic_edges).intersection(set(proteomic_influence_edge_tuple)))
list2 = list(set(integrated_edges).intersection(set(proteomic_influence_edge_tuple)))
len(list(set(list1).intersection(set(list2))))

8208
17
45


2

### Testing the integrated only pathways

In [77]:
integrated_only = list(set(integrated_edges) - (set(metabolomic_edges)|set(proteomic_edges)))
len(integrated_only)

1437

Common pathway | Any            =   857 - 61 = 796 (account for overlap with new pathway)   <br>

New integrated pathways: <br>
New pathway | (Any)   =   669  <br>


Metabolomic-influenced metabolomic pathway | (Any)  =  26    <br>
Proteomic-influenced metabolomic pathway | (Any)  =  272    <br>

Pathway only in metabolomics | Pathway only in proteomics   =   676   <br>

False positives: <br>
Pathway only in metabolomics | Pathway only in metabolomics   =   1  <br>
Pathway only in proteomics | Pathway only in proteomics   =   95  <br>




In [78]:
print(len(proteomic_only_pathway))
print(len(proteomic_pathway_with_onemetabolite))
print(len(metabolomic_only_pathway))
print(len(metabolomic_pathway_with_oneprotein))
print(len(both_omic))
print(len(new_pathway))

485
46
65
32
47
35


In [79]:
#Pathways where one is a common pathway

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in both_omic) or (pathway_pair[1] in both_omic):
        #print(pathway_pair)
        counter += 1

print(counter)

382


In [80]:
#Pathways where one is a common pathway and one is a new pathway
 
counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in both_omic) and (pathway_pair[1] in new_pathway):
        counter += 1
    if (pathway_pair[1] in both_omic) and (pathway_pair[0] in new_pathway):
        counter += 1

print(counter)

23


In [81]:
#Pathways where one is a new pathway

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in new_pathway) or (pathway_pair[1] in new_pathway):
        counter += 1

print(counter)

203


In [86]:
#Pathways where both are the same

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in metabolomic_only_pathway) and (pathway_pair[1] in metabolomic_only_pathway):
       counter += 1
    #if (pathway_pair[0] in proteomic_only_pathway) and (pathway_pair[1] in proteomic_only_pathway):
    #    counter += 1
    #if (pathway_pair[0] in metabolomic_pathway_with_oneprotein) and (pathway_pair[1] in metabolomic_pathway_with_oneprotein):
    #    counter += 1
    if (pathway_pair[0] in proteomic_pathway_with_onemetabolite) and (pathway_pair[1] in proteomic_pathway_with_onemetabolite):
        counter += 1

print(counter)

8


In [87]:
counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in proteomic_pathway_with_onemetabolite) and (pathway_pair[1] in proteomic_only_pathway):
        counter += 1
    if (pathway_pair[1] in proteomic_pathway_with_onemetabolite) and (pathway_pair[0] in proteomic_only_pathway):
        counter += 1

print(counter)

121


In [88]:

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in metabolomic_pathway_with_oneprotein) and (pathway_pair[1] in metabolomic_only_pathway):
        counter += 1
    if (pathway_pair[1] in metabolomic_pathway_with_oneprotein) and (pathway_pair[0] in metabolomic_only_pathway):
        counter += 1

print(counter)

7


In [89]:

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in metabolomic_pathway_with_oneprotein) and (pathway_pair[1] in proteomic_only_pathway):
        counter += 1
    if (pathway_pair[1] in metabolomic_pathway_with_oneprotein) and (pathway_pair[0] in proteomic_only_pathway):
        counter += 1

print(counter)

70


In [90]:

counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in proteomic_pathway_with_onemetabolite) and (pathway_pair[1] in metabolomic_only_pathway):
        counter += 1
    if (pathway_pair[1] in proteomic_pathway_with_onemetabolite) and (pathway_pair[0] in metabolomic_only_pathway):
        counter += 1

print(counter)

20


In [91]:
counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in proteomic_pathway_with_onemetabolite) and (pathway_pair[1] in metabolomic_pathway_with_oneprotein):
        counter += 1
    if (pathway_pair[1] in proteomic_pathway_with_onemetabolite) and (pathway_pair[0] in metabolomic_pathway_with_oneprotein):
        counter += 1

print(counter)

15


In [92]:


counter = 0 

for edges in integrated_only:
    pathway_pair = list(edges)
    if (pathway_pair[0] in proteomic_only_pathway) and (pathway_pair[1] in metabolomic_only_pathway):
        #print(pathway_pair)
        counter += 1
    if (pathway_pair[0] in metabolomic_only_pathway) and (pathway_pair[1] in proteomic_only_pathway):
        #print(pathway_pair)
        counter += 1

print(counter)

48


In [93]:
#Probably false postives:  
#Proteomic influenced only (i.e. pairs where both pathways are only in the proteomic dataset)
print(len(list(set(proteomic_only_edge_tuple).intersection(set(integrated_only)))))
#Metabolomic influenced only (i.e. pairs where both pathways are only in the metabolomic dataset)
print(len(list(set(metabolomic_only_edge_tuple).intersection(set(integrated_only)))))

586
0


In [94]:
#Check the significance of the 'false positives' IN THE INTEGRATED DATASET
proteomic_false_pos = list(set(proteomic_only_edge_tuple).intersection(set(integrated_only)))
metabolomic_false_pos = list(set(metabolomic_only_edge_tuple).intersection(set(integrated_only)))

In [109]:
import os 
#import numpy as np 
import statsmodels.stats.multitest
#import pandas as pd
from itertools import compress

#import seaborn as sns
#import matplotlib.pyplot as plt

In [103]:
path = os.getcwd()[:-13] + '\\Data\\permutation_test_files_integrated\\Values'

val_array = []


for filename in os.listdir(path): #also lists directories
    with open(os.path.join(path, filename)) as file:    
        lines = file.readlines()
        vals = lines[0].split(';')
        vals =  [int(x) for x in vals]
        #print(vals)
        if filename == 'vals1.txt':
            val_array = np.array(vals)
        else:
            val_array = np.vstack([val_array, vals])

val_array
val_array2 = val_array.sum(axis=0) #add up the values by columns for each pathway pair
pval_array = val_array2 / 100000


In [106]:
sig_edge_boolean,adjusted_pval = statsmodels.stats.multitest.fdrcorrection(pval_array, alpha=0, method='poscorr', is_sorted=False)

In [115]:
df = pd.read_csv('../Data/permutation_test_files_proteomics/initial_tstats.csv', index_col=0)
edgelist = df.index


sig_edges = list(compress(edgelist,sig_edge_boolean))


#Zip p-values to edge names to form dictionary (so I can test significance of differentially expressed genes)
edgelist_tuple = []
for i in range(len(edgelist)):
    edge = edgelist[i].split(",")
    edge = (edge[0],edge[1][1:])
    edgelist_tuple.append(edge)
    
p_val_dict  = dict(zip(edgelist_tuple, adjusted_pval))


In [116]:
proteomic_false_pos_pval = []
metabolomic_false_pos_pval = []

for edge in proteomic_false_pos:
    if edge in list(p_val_dict.keys()):
        proteomic_false_pos_pval.append(p_val_dict[edge])

#for edge in metabolomic_false_pos:
#    if edge in list(p_val_dict.keys()):
#        metabolomic_false_pos_pval.append(p_val_dict[edge])

proteomic_false_pos_pval[:100]
#len(proteomic_false_pos_pval)

#print(metabolomic_false_pos_pval) #[0.000747091124962897] (not that close to threshold)
sum(proteomic_false_pos_pval)/len(proteomic_false_pos_pval)  #average = 0.0045  #close to p < 0.05 threshold

0.3132527100381446

In [121]:
df.loc[edgelist]

Unnamed: 0_level_0,Initial_tstat
Edges,Unnamed: 1_level_1
"R-HSA-109606, R-HSA-109581",-0.267043
"R-HSA-109704, R-HSA-109581",-0.423681
"R-HSA-109704, R-HSA-109606",-0.332380
"R-HSA-110362, R-HSA-109581",-0.090820
"R-HSA-110362, R-HSA-109606",-0.031440
...,...
"R-HSA-983705, R-HSA-9824446",0.048487
"R-HSA-983705, R-HSA-983168",-0.231214
"R-HSA-983705, R-HSA-983169",-0.029522
"R-HSA-983705, R-HSA-983231",-0.246918


In [123]:
df = pd.read_csv('../Data/permutation_test_files_integrated/initial_tstats.csv', index_col=0)
df.loc[edgelist]

Unnamed: 0_level_0,Initial_tstat
Edges,Unnamed: 1_level_1
"R-HSA-109606, R-HSA-109581",-0.267043
"R-HSA-109704, R-HSA-109581",-0.423681
"R-HSA-109704, R-HSA-109606",-0.332380
"R-HSA-110362, R-HSA-109581",-0.090820
"R-HSA-110362, R-HSA-109606",-0.031440
...,...
"R-HSA-983705, R-HSA-9824446",0.041855
"R-HSA-983705, R-HSA-983168",-0.231214
"R-HSA-983705, R-HSA-983169",-0.029522
"R-HSA-983705, R-HSA-983231",-0.246918


In [117]:
proteomic_false_pos_pval

[0.48116155255997894,
 0.452539213996598,
 0.3530192408017089,
 0.5019235598157846,
 0.5021408403650199,
 0.3627973350104706,
 0.019964200038692207,
 0.09025698582402789,
 0.452325096762596,
 0.19886303450294762,
 0.5328831243764276,
 0.03243863844655745,
 0.17647949790115977,
 0.454855762514227,
 0.4348221233999307,
 0.5252758798464877,
 0.5158479845432442,
 0.09023141148514853,
 0.23917196071652763,
 0.3753737229740187,
 0.4521891616766467,
 0.1261641344383057,
 0.12010760654774622,
 0.21249840332458442,
 0.059516672678449754,
 0.5514093931884275,
 0.08179887115968021,
 0.03599799446191897,
 0.5153414398895442,
 0.4584301098469988,
 0.4620430074098379,
 0.17966481293263006,
 0.4474918738534462,
 0.3877053701482166,
 0.47491673143350605,
 0.5039922224342831,
 0.45838336796154056,
 0.06897146582915495,
 0.05237665217671996,
 0.008366901824628795,
 0.40517490554750957,
 0.05534593794459787,
 0.19207150780502363,
 0.3931106587593129,
 0.49528711804194436,
 0.4671364524268084,
 0.30545280

In [114]:
proteomic_false_pos

[('R-HSA-450531', 'R-HSA-195258'),
 ('R-HSA-5675482', 'R-HSA-453279'),
 ('R-HSA-1839130', 'R-HSA-166058'),
 ('R-HSA-975144', 'R-HSA-8941326'),
 ('R-HSA-5654687', 'R-HSA-168142'),
 ('R-HSA-5654693', 'R-HSA-168181'),
 ('R-HSA-5693538', 'R-HSA-5357769'),
 ('R-HSA-5654688', 'R-HSA-168138'),
 ('R-HSA-5654706', 'R-HSA-166058'),
 ('R-HSA-975155', 'R-HSA-190373'),
 ('R-HSA-5654695', 'R-HSA-168176'),
 ('R-HSA-190375', 'R-HSA-168188'),
 ('R-HSA-190241', 'R-HSA-168138'),
 ('R-HSA-3595174', 'R-HSA-1236974'),
 ('R-HSA-9716542', 'R-HSA-6811558'),
 ('R-HSA-5358351', 'R-HSA-418990'),
 ('R-HSA-199991', 'R-HSA-112314'),
 ('R-HSA-69202', 'R-HSA-5213460'),
 ('R-HSA-446652', 'R-HSA-1169410'),
 ('R-HSA-5654227', 'R-HSA-168138'),
 ('R-HSA-168138', 'R-HSA-1226099'),
 ('R-HSA-5689896', 'R-HSA-168138'),
 ('R-HSA-5655332', 'R-HSA-166058'),
 ('R-HSA-1169410', 'R-HSA-109581'),
 ('R-HSA-190375', 'R-HSA-168176'),
 ('R-HSA-168179', 'R-HSA-1226099'),
 ('R-HSA-975155', 'R-HSA-5654227'),
 ('R-HSA-5654704', 'R-HSA-168188