This script is used to 1.) construct the naive difference network, 2.) construct the differential network, and 3.) construct a consensus network from the edges that are significant in both the naive difference network and the differential network, which can also be used to compare the two methods.

### Comparing condition-specific networks (mild vs severe) with the full correlation results and then constructing the naive difference network

In [None]:
#Load libraries 
import networkx as nx
import pandas as pd
import numpy as np
import sspa

In [None]:
#Input is either: metabolomic, proteomic, integrated
#Every edge is stored as a sorted tuple so that the same undirected edge always has a single representation,
#i.e. ('R-HSA-192456', 'R-HSA-112315') and ('R-HSA-112315', 'R-HSA-192456') become the same tuple

#Full correlation network
full_network = nx.read_gml("../Cytoscape/proteomic_final_commoncases.gml")
#Read the nodes/edges from full_network itself. The earlier version read them from `G`, which is not
#defined at this point on a fresh run (or refers to whatever graph a later cell left behind)
all_nodes = list(full_network.nodes())
all_edges = list(full_network.edges())
all_edges = [tuple(sorted(tuple1)) for tuple1 in all_edges]


#Mild-case network
mild_network = nx.read_gml("../Cytoscape/proteomic_final_mildcases.gml")
mild_nodes = list(mild_network.nodes())
mild_edges = list(mild_network.edges())
mild_edges = [tuple(sorted(tuple1)) for tuple1 in mild_edges]


#Severe-case network
severe_network = nx.read_gml("../Cytoscape/proteomic_final_severecases.gml")
severe_nodes = list(severe_network.nodes())
severe_edges = list(severe_network.edges())
severe_edges = [tuple(sorted(tuple1)) for tuple1 in severe_edges]

In [None]:
#Sanity check: report any severe edge that also occurs in the mild network with its endpoints swapped,
#i.e. ('R-HSA-192456', 'R-HSA-112315') and ('R-HSA-112315', 'R-HSA-192456')

#Shouldn't be a problem now as the edges have been sorted as tuples
for severe_first, severe_second in severe_edges:
    for mild_first, mild_second in mild_edges:
        is_swapped_pair = severe_first == mild_second and mild_first == severe_second
        if is_swapped_pair:
            print([severe_first, severe_second], [mild_first, mild_second])

In [None]:
#Confirm that the mild and severe nodes DO NOT have new nodes that the full network doesn't
#(if the shared count is smaller than the condition network's own count, that network has extra nodes)
full_node_set = set(all_nodes)
shared_with_mild = full_node_set.intersection(set(mild_nodes))
shared_with_severe = full_node_set.intersection(set(severe_nodes))

print("Number of nodes in full correlation network: ", len(all_nodes))
print("Number of nodes in mild network: ", len(mild_nodes))
print("Number of nodes in severe network: ", len(severe_nodes))
print("Number of nodes in both mild network and full correlation network: ", len(shared_with_mild))
print("Number of nodes in both severe network and full correlation network: ", len(shared_with_severe))

#Metabolomic severe network has 3 new nodes  #{'R-HSA-211859', 'R-HSA-5663205', 'R-HSA-9734207'}
#Proteomic same nodes for all
#Integrated mild network has 4 new nodes and severe network has 3 new nodes  #{'R-HSA-192105', 'R-HSA-193368', 'R-HSA-211976', 'R-HSA-5619084'} {'R-HSA-174824', 'R-HSA-83936', 'R-HSA-8956321'} 

In [None]:
#Find nodes which are present in mild/severe network not present in full network
#(a network's own nodes minus the nodes it shares with the full network); mild is printed first, then severe
for condition_nodes in (mild_nodes, severe_nodes):
    shared_nodes = set(all_nodes).intersection(set(condition_nodes))
    print(set(condition_nodes).difference(shared_nodes))

In [None]:
#Print out number of edges
edge_count_report = (
    ("Number of edges in full correlation network: ", all_edges),
    ("Number of edges in mild network: ", mild_edges),
    ("Number of edges in severe network: ", severe_edges),
)
for report_label, edge_list in edge_count_report:
    print(report_label, len(edge_list))

In [None]:
#Example code from https://stackoverflow.com/questions/41125909/python-find-elements-in-one-list-that-are-not-in-the-other
#Toy illustration of the set difference used below: elements of list_2 that do not occur in list_1
list_1 = ["a", "b", "c", "d", "e"]
list_2 = ["a", "f", "c", "m"]
set(list_2).difference(list_1)

#{'f', 'm'}

In [None]:
#Edges present in the mild network but not the severe network
#(computed once, then counted; the set expression is unchanged so the list order is the same)
mild_naive = list(set(mild_edges) - set(severe_edges))
print(len(mild_naive))

In [None]:
#Edges present in the severe network but not the mild network
#(computed once, then counted; the set expression is unchanged so the list order is the same)
severe_naive = list(set(severe_edges) - set(mild_edges))
print(len(severe_naive))

In [None]:
#Edges present in both mild and severe
#(computed once, then counted; the set expression is unchanged so the list order is the same)
mild_severe_naive = list(set(severe_edges) & set(mild_edges))
print(len(mild_severe_naive))

In [None]:
#Count the condition-specific edges that do NOT occur in the full correlation network
full_edge_set = set(all_edges)

#Edges in the mild network (but not severe network) not in the full network
print(len(set(mild_naive) - full_edge_set))

#Edges in the severe network (but not mild network) not in the full network
print(len(set(severe_naive) - full_edge_set))

#Edges in the mild network AND severe network not in the full network
print(len(set(mild_severe_naive) - full_edge_set))

#edges in full correlation network
print(len(all_edges))

In [None]:
#Create a dictionary in which the naive difference network edges are classed as mild or severe
mild_dict = dict.fromkeys(mild_naive, "Mild")
severe_dict = dict.fromkeys(severe_naive, "Severe")

#Mild entries first, then severe entries (a severe entry would overwrite a mild one with the same key)
condition_dict = {**mild_dict, **severe_dict}

print(condition_dict)

In [None]:
#Number of entries (labelled edges) currently held in condition_dict
len(condition_dict)

In [None]:
#Create the naive difference network from the mild-only and severe-only edges
naive_edges = mild_naive + severe_naive
G = nx.from_edgelist(naive_edges)
print(G.number_of_edges())
#Label each edge with the condition recorded for it in condition_dict
nx.set_edge_attributes(G, condition_dict, "Condition")

Add the node attributes here:

In [None]:
#Must filter out some of the edges first

#Compute both centrality measures (each is returned as a dictionary keyed by node)
betweenness = nx.betweenness_centrality(G, normalized=True)
degree = nx.degree_centrality(G)

#Store the betweenness centrality under "betweenness" and the degree centrality under "degrees"
for attribute_name, centrality in (("betweenness", betweenness), ("degrees", degree)):
    nx.set_node_attributes(G, centrality, attribute_name)


#Check
print(G.number_of_nodes())
#G.nodes['R-HSA-110331']#["betweenness"]

In [None]:
#Using Cecilia's code

#Load the Reactome pathway relations (tab-separated, no header row)
#Column 0 is the parent pathway, column 1 is the child pathway
hierarchy = pd.read_csv('../Data/ReactomePathwaysRelation.txt', sep='\t', header=None)

#Keep only the rows whose parent ID contains 'HSA' (Homo sapiens)
is_human = hierarchy[0].str.contains('HSA')
hierarchy_hsa = hierarchy[is_human]

#IDs found in the parent column but never in the child column, as a numpy array
#These values are not child pathways in any instances
hierarchy_hsa_parents = np.setdiff1d(hierarchy_hsa[0], hierarchy_hsa[1])

#Pair each of those IDs with itself and append the rows underneath the original relations,
#so that they also appear in the child column
self_pairs = pd.DataFrame([hierarchy_hsa_parents, hierarchy_hsa_parents], index=[0, 1]).T
hierarchy_hsa_all = pd.concat([hierarchy_hsa, self_pairs])

#Directed graph (parent -> child) built from the relations WITHOUT the appended self-pairs
H = nx.from_pandas_edgelist(hierarchy_hsa, source=0, target=1, create_using=nx.DiGraph())


In [None]:
#Convert pathway ID to name
#The spreadsheet has no header row: column 0 is used as the key and column 1 as the value
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = dict(zip(root_path[0], root_path[1]))

#Using Cecilia's code

#Find the root pathway

def find_root(H,child):
    """Return the top-most ancestor of `child` in the directed graph `H`.

    Follows the first predecessor of each node upwards until a node with
    no predecessors is reached; that node is returned. A node that has no
    predecessors is its own root.
    """
    ancestors = list(H.predecessors(child))

    #Still below the top level: continue from the first listed parent
    if ancestors:
        return find_root(H, ancestors[0])

    #No parent left, so this is the highest level
    return child

#Root of every pathway listed in the child column (column 1)
hierarchy_hsa_all['Root'] = [find_root(H, i) for i in hierarchy_hsa_all[1]]

hierarchy_hsa_all.columns = ['Parent', 'Child', 'Root']

#There are instances of duplicates, however all the child duplicates have the same root (even though different parents) after checking

#Map every child pathway to the root of its first occurrence.
#This is built once by position rather than by looking rows up through their index labels:
#hierarchy_hsa_all was concatenated without resetting the index, so labels may repeat and a
#label-based lookup could return several rows instead of a single root. It also avoids
#rescanning the whole Child column for every node.
child_to_root = {}
for child, root in zip(hierarchy_hsa_all['Child'], hierarchy_hsa_all['Root']):
    child_to_root.setdefault(child, root)

#Node -> name of its root pathway (raises KeyError if a root ID is missing from root_pathway_dict)
root_pathways = {}
for pathway in H.nodes:
    root_pathways[pathway] = root_pathway_dict[child_to_root[pathway]]


In [None]:
#Using Cecilia's code
 
#Shows all the root pathways in Reactome
#NOTE(review): in a notebook only the last expression of a cell is displayed, so this and the next
#set(...) presumably show nothing unless run on their own - confirm
set(hierarchy_hsa_all['Root'] )
#Shows all the root pathways present in the original dataset
set(root_pathways.values())

#Attach the root pathway name to each node of G under the "root_pathway" attribute
#NOTE(review): root_pathways is keyed by the nodes of H, not of G; networkx documents that keys which
#are not nodes of G are ignored - confirm for the installed version
nx.set_node_attributes(G, root_pathways, "root_pathway")

#Check the number of nodes in G
print(G.number_of_nodes()) 

In [None]:
#Clustering with Louvain algorithm

#Original author's note: unsure whether the clustering really uses the weight, since it also runs
#with a misspelled attribute name; changing the name did change the number of clusters with the seed set
#NOTE(review): the graphs built in this notebook via nx.from_edgelist are only given a "Condition"
#edge attribute, so 'Squared_corr' is presumably absent here - confirm which weights Louvain actually sees
#Resolution = 1 is the default, increasing resolution will yield more communities
louvain_clusters = nx.community.louvain_communities(
    G,
    weight='Squared_corr',
    seed=100,
    resolution=1.2,
)
print(len(louvain_clusters))

#Number the communities starting from 1 and record the community number of every node
louvain_dict = {}
for cluster_number, members in enumerate(louvain_clusters, start=1):
    for pathway in members:
        louvain_dict[pathway] = cluster_number

nx.set_node_attributes(G, louvain_dict, "louvain")

In [None]:
#Assign pathway name to node
#Load the Reactome pathway table for the omics type being analysed (keep exactly one option active)

#If looking at metabolomic network:
#reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt") 

#If looking at proteomic network:
reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile='../Data/UniProt2Reactome_All_Levels_ver84.txt',
    download_latest=False,
    filepath=None,
)

#If looking at integrated network:
#reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str")


In [None]:
#Assign pathway name to node

#Map each index entry of reactome_pathways to its "Pathway_name" value.
#The pairs are taken positionally with zip instead of reactome_pathways["Pathway_name"][i]:
#integer keys on a Series with a non-integer index rely on a positional fallback that pandas has deprecated.
#As before, if an index entry is repeated the last row wins.
pathway_name_dict = dict(zip(reactome_pathways.index, reactome_pathways["Pathway_name"]))
#Filter dictionary to the pathways in the dataset only
#(raises KeyError if a node of G has no entry in reactome_pathways, same as before)
pathway_name_dict = {k:pathway_name_dict[k] for k in list(G.nodes)}

nx.set_node_attributes(G, pathway_name_dict, "pathway_name")

In [None]:
#Save naive difference network

#nx.write_gml(G,'../Cytoscape/metabolomic_naive_diff.gml')
#nx.write_gml(G,'../Cytoscape/proteomic_naive_diff.gml')
#nx.write_gml(G,'../Cytoscape/integrated_naive_diff.gml')

### Constructing the differential networks

In [None]:
import networkx as nx

In [None]:
#Reading in the edges expressed in the differential network
#with open('../Data/permutation_test_files_metabolomics/sig_edges.txt') as f:
#with open('../Data/permutation_test_files_integrated/sig_edges.txt') as f:
with open('../Data/permutation_test_files_proteomics/sig_edges.txt') as f:
    lines = f.readlines()

#Only the first line of the file is used; it is split on commas into a flat list of node IDs
edges = lines[0].split(",")

#Pair up consecutive entries (0 and 1, 2 and 3, ...) into edges. The first character of the second
#entry of each pair is dropped.
#NOTE(review): presumably that character is the space after the comma - confirm against the file format
#Each pair is sorted so that the same edge always has the same tuple,
#i.e.('R-HSA-192456', 'R-HSA-112315') and ('R-HSA-112315', 'R-HSA-192456')
edges_remaining = [
    tuple(sorted((edges[position], edges[position + 1][1:])))
    for position in range(0, len(edges), 2)
]
print(len(edges_remaining))

In [None]:
#Reading in the directionality of the edges expressed in the differential network

#with open('../Data/permutation_test_files_metabolomics/sigedge_direction.txt') as f:
#with open('../Data/permutation_test_files_integrated/sigedge_direction.txt') as f:
with open('../Data/permutation_test_files_proteomics/sigedge_direction.txt') as f:
    lines = f.readlines()

#Only the first line is used; it is split on commas into one direction label per edge
#NOTE(review): the labels are not stripped, so a trailing newline or space would stay part of a label - confirm the file format
sigedge_direction = lines[0].split(",")

print(len(sigedge_direction))
print(sigedge_direction)

In [None]:
#Create a dictionary where the edges are assigned the direction in which they are significantly expressed in
#The i-th direction label belongs to the i-th edge of edges_remaining

condition_dict = {
    edges_remaining[position]: direction
    for position, direction in enumerate(sigedge_direction)
}
print(condition_dict)

In [None]:
#Create a list of all the mild and severe edges from differential network
#(edges whose label is neither "mild" nor "severe" end up in neither list)

mild_diff_edges = [edge for edge, direction in condition_dict.items() if direction == "mild"]
severe_diff_edges = [edge for edge, direction in condition_dict.items() if direction == "severe"]


In [None]:
#Create the differential network from all significant edges

G = nx.from_edgelist(edges_remaining)
print(G.number_of_edges())
#Label each edge with its direction from condition_dict
nx.set_edge_attributes(G, condition_dict, "Condition")

Before running the cells underneath, re-run the node-attribute cells from Section 1 (the cells under "Add the node attributes here:", which were used there for the naive difference network) on this graph `G`, so that the node attributes are added to the differential network:

In [None]:
#nx.write_gml(G,'../Cytoscape/metabolomic_differential.gml')
#nx.write_gml(G,'../Cytoscape/proteomic_differential.gml')
#nx.write_gml(G,'../Cytoscape/integrated_differential.gml')

In [None]:
#Number of edges in the graph currently held in G
G.number_of_edges()

### Constructing the consensus network

Even when filtering at alpha < 1e-5 for the differential network, the proteomic and integrated networks still have too many edges, so I take the intersection of those edges with the ones from the condition-specific (naive difference) network to form the consensus network.

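First run the code from the first section to get the mild_naive and severe_naive lists (the mild_diff_edges and severe_diff_edges lists from the differential network section above are also needed).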

In [None]:
#Reading in the edges expressed in the differential network
#with open('../Data/permutation_test_files_metabolomics/sig_edges.txt') as f:
#with open('../Data/permutation_test_files_integrated/sig_edges.txt') as f:
with open('../Data/permutation_test_files_proteomics/sig_edges.txt') as f:
    lines = f.readlines()

#Only the first line of the file is used; it is split on commas into a flat list of node IDs
edges = lines[0].split(",")

#Pair up consecutive entries (0 and 1, 2 and 3, ...) into edges. The first character of the second
#entry of each pair is dropped.
#NOTE(review): presumably that character is the space after the comma - confirm against the file format
#Each pair is sorted so that the same edge always has the same tuple,
#i.e.('R-HSA-192456', 'R-HSA-112315') and ('R-HSA-112315', 'R-HSA-192456')
edges_remaining = [
    tuple(sorted((edges[position], edges[position + 1][1:])))
    for position in range(0, len(edges), 2)
]
print(len(edges_remaining))

In [None]:
#Reading in the directionality of the edges expressed in the differential network

#with open('../Data/permutation_test_files_metabolomics/sigedge_direction.txt') as f:
#with open('../Data/permutation_test_files_integrated/sigedge_direction.txt') as f:
with open('../Data/permutation_test_files_proteomics/sigedge_direction.txt') as f:
    lines = f.readlines()

#Only the first line is used; it is split on commas into one direction label per edge
#NOTE(review): the labels are not stripped, so a trailing newline or space would stay part of a label - confirm the file format
sigedge_direction = lines[0].split(",")

print(len(sigedge_direction))
print(sigedge_direction)

In [None]:
#Mild edges found by both methods: mild differential edges that are also mild-only naive edges
mild_naive_unique = list(set(mild_naive))
intersection1 = list(set(mild_diff_edges).intersection(mild_naive_unique))
len(intersection1)

In [None]:
#Severe edges found by both methods: severe differential edges that are also severe-only naive edges
severe_naive_unique = list(set(severe_naive))
intersection2 = list(set(severe_diff_edges).intersection(severe_naive_unique))
len(intersection2)

In [None]:
#Consensus edges: the mild overlap followed by the severe overlap
edges_for_consensus_network = intersection1 + intersection2

#Class every consensus edge as mild or severe
mild_dict = dict.fromkeys(intersection1, "Mild")
severe_dict = dict.fromkeys(intersection2, "Severe")

#Mild entries first, then severe entries (a severe entry would overwrite a mild one with the same key)
condition_dict = {**mild_dict, **severe_dict}

print(condition_dict)

In [None]:
#Overlap of mild naive network and severe differential network (should be 0)
mild_naive_unique = list(set(mild_naive))
intersection = list(set(severe_diff_edges).intersection(mild_naive_unique))
len(intersection)

In [None]:
#Overlap of severe naive network and mild differential network (should be 0)
severe_naive_unique = list(set(severe_naive))
intersection = list(set(mild_diff_edges).intersection(severe_naive_unique))
len(intersection)

In [None]:
#edges_remaining = all edges in the differential network
#Overlap of differential network edges and edges in both the mild and severe network (should be 0)

shared_naive_unique = list(set(mild_severe_naive))
intersection = list(set(edges_remaining).intersection(shared_naive_unique))
len(intersection)

In [None]:
#Overlap of all differential network edges with all edges in the full correlation network
full_edges_unique = list(set(all_edges))
intersection = list(set(edges_remaining).intersection(full_edges_unique))
len(intersection)

In [None]:
#Create the consensus network from the edges found by both methods
G = nx.from_edgelist(edges_for_consensus_network)
print(G.number_of_edges())
#Label each edge with the condition recorded for it in condition_dict
nx.set_edge_attributes(G, condition_dict, "Condition")

Before running the cells underneath, re-run the node-attribute cells from Section 1 (the cells under "Add the node attributes here:", which were used there for the naive difference network) on this graph `G`, so that the node attributes are added to the consensus network:

In [None]:
#No overlap between metabolomic and naive differential network,so don't need to re-do
#nx.write_gml(G,'../Cytoscape/proteomic_differential_intersect.gml') 
#nx.write_gml(G,'../Cytoscape/integrated_differential_intersect.gml')