In [6]:
import pandas as pd
import networkx as nx
from typing import List,Tuple
from pyvis.network import Network
from collections import Counter

In [7]:
# functions to manipulate network 

# given CpG id return the connected network 
def subgraph_by_cpg(net:nx.classes.digraph.DiGraph,cpg:str)->nx.classes.digraph.DiGraph:
    """Return the part of ``net`` that is connected to ``cpg``, ignoring edge direction.

    Parameters
    ----------
    net : nx.DiGraph
        Directed association network.
    cpg : str
        Identifier of the node to start from.

    Returns
    -------
    nx.DiGraph
        ``net.subgraph(...)`` restricted to every node that can be reached from
        ``cpg`` when edges are treated as undirected (``cpg`` itself included).
        The result is a subgraph of ``net``, so edge directions and attributes
        are those of ``net``.

    Raises
    ------
    nx.NodeNotFound
        If ``cpg`` is not a node of ``net``.
    """
    if cpg not in net:
        # fail early with networkx's own "missing source node" exception type
        raise nx.NodeNotFound(f"Source {cpg} not in G")

    # A read-only undirected view is enough for the traversal and avoids
    # duplicating the whole graph in memory.
    undirected_view = net.to_undirected(as_view=True)

    # Only the set of reachable nodes is needed, not the paths leading to them.
    reachable_nodes = nx.node_connected_component(undirected_view, cpg)
    return net.subgraph(reachable_nodes)



In [8]:

# LD-clump association table (tab-separated text file under data/)
file_name = "ld_clump_assoc.txt"
data = pd.read_csv('data/'+file_name, sep="\t")

# rank CpGs by the number of table rows they appear in and keep the
# top_n most frequent ids (candidates for plotting)
top_n = 10000
cpg_row_counts = Counter(data['CpG'].values)
cpg_ids = [cpg for cpg, _count in cpg_row_counts.most_common(top_n)]

# optional row filters, currently disabled

#data = data[data['CpG chr']== 1]
#data = data[data['CpG'].isin(cpg_ids)]



In [9]:
# Build the node and edge lists that are later fed to the networkx graph

# --- NODES: one per distinct CpG, top SNP and LD clump in the table ---

cpgs:List[str] = data['CpG'].unique()
snps:List[str] = data['Top SNP'].unique()
lds:List[str] = data['LD clump'].unique()

# (node id, attribute dict) pairs as accepted by networkx; the colour encodes the node type.
# LD-clump ids get an 'ld_' prefix (presumably to keep them distinct from SNP ids - confirm).

nodes_cpg = [(cpg_id, {'color':'blue'}) for cpg_id in cpgs]
nodes_snp = [(snp_id, {'color':'green'}) for snp_id in snps]
nodes_LD = [('ld_' + clump, {'color':'yellow'}) for clump in lds]

# --- EDGES: one (source, target) tuple per table row ---

ld_labels = ['ld_' + clump for clump in data['LD clump']]  # prefixed LD-clump id for every row

cpg_snp_edges:list[Tuple] = list(zip(data['CpG'], data['Top SNP']))  # CpG -> SNP
snp_ld_edges:list[Tuple] = list(zip(data['Top SNP'], ld_labels))     # SNP -> LD clump
cpg_ld_edges:list[Tuple] = list(zip(data['CpG'], ld_labels))         # CpG -> LD clump




In [10]:
# Directed networkx graph of the CpG / LD-clump associations

cpgNet = nx.DiGraph()

# nodes: CpGs and LD clumps only (SNP nodes are left out)

cpgNet.add_nodes_from(nodes_cpg)
#cpgNet.add_nodes_from(nodes_snp)
cpgNet.add_nodes_from(nodes_LD)

# edges: only the direct CpG -> LD-clump links are used

#cpgNet.add_edges_from(cpg_snp_edges,color='black')
#cpgNet.add_edges_from(snp_ld_edges,color='red')
cpgNet.add_edges_from(cpg_ld_edges, color='black')

# prune nodes whose degree is exactly 1, e.g. an LD clump attached to a single CpG
# or a CpG attached to a single LD clump (one pass only; degrees are not re-checked afterwards)
nodes_with_degree_1 = [node for node, degree in cpgNet.degree if degree == 1]
cpgNet.remove_nodes_from(nodes_with_degree_1)

#cpgNet = subgraph_by_cpg(cpgNet,cpg_ids[-1])
cpgNet.number_of_nodes()

212855

### to-dos: 
- add summary-statistics-based filters to filter out non-significant SNP-CpG pairs
- add weight attribute to network graphs to encode pair significance

- remove SNP nodes and keep only the LD clumps; show the specific SNPs when an LD clump is clicked (SNPs removed)
- try prototype on the whole network
	- to see the clusters 
- check color scheme 
	- saturated colors 
	- Rampvis color scheme 
- given a CpG, write a function that finds the associated CpGs in the network

In [20]:
# Neighbourhood of the most frequent CpG: every node at most `allowed_jumps`
# edges away from cpg_ids[0], with edge direction ignored.

copy_net = cpgNet  # NOTE: plain alias of cpgNet (no data is copied); name kept for compatibility

# read-only undirected view: sufficient for the search and avoids duplicating the graph
undirected_net:nx.classes.graph.Graph = cpgNet.to_undirected(as_view=True)
allowed_jumps =  2

# `cutoff` stops the breadth-first search after allowed_jumps hops instead of
# walking the whole connected component and discarding the distant nodes afterwards
hop_counts = nx.single_source_shortest_path_length(undirected_net, cpg_ids[0], cutoff=allowed_jumps)
reachable_nodes = list(hop_counts)  # node ids, in the order the search discovered them

filt_net = copy_net.subgraph(reachable_nodes)
len(filt_net.nodes)

465

In [25]:
# vis.js options passed to pyvis: lay the graph out with the forceAtlas2Based physics solver
vis_options = '''
const options = {
  "physics": {
    "solver": "forceAtlas2Based"
  }
}'''

# load the filtered sub-network into a pyvis Network and write it to an HTML page
net_gen:Network = Network()
net_gen.from_nx(filt_net)
net_gen.set_options(vis_options)
net_gen.show('top5CpG.html')