In [1]:
import sys
from pathlib import Path

# Expose the directory above the notebook's working directory to the import
# system, so packages that live there can be imported by the cells below
module_path = str(Path.cwd().parents[0])
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# PanRicci

In [2]:
import networkx as nx
import numpy as np
import math
import ot

## Load GFA

In [3]:
from panricci.utils.gfa_loader import GFALoader

In [4]:
# Build a loader with undirected=False, then parse the toy GFA file.
# The call yields three items: the node table, the edge list and the graph object.
gfa_loader = GFALoader(undirected=False)
nodes, edges, G = gfa_loader("../data/test1.gfa")

In [5]:
# Node table returned by the loader: one entry per segment id
nodes

{'1': {'label': 'AC', 'len': 2, 'node_depth': 1.0},
 '2': {'label': 'AA', 'len': 2, 'node_depth': 0.5},
 '3': {'label': 'CCTAAA', 'len': 6, 'node_depth': 1.0}}

In [6]:
# Edge list returned by the loader: pairs of segment ids
edges

[('1', '3'), ('1', '2'), ('2', '3')]

## Probability distribution for a node

In [7]:
from panricci.distributions.variation_graph import DistributionNodes
# Callable that returns the probability distribution attached to a node of G.
# NOTE(review): alpha presumably is the mass kept on the node itself — confirm against DistributionNodes
distribution_nodes = DistributionNodes(G, alpha=0.5)

In [8]:
# Distribution for node "2": a dict mapping node id -> probability mass
distribution_nodes(node="2")

{'2': 0.5, '3': 0.5}

## Ricci Flow

In [10]:
from panricci.ricci_flow import RicciFlow
# Ricci-Flow runner for G; dirsave_graphs is the folder handed over for saving graphs.
# NOTE(review): G was loaded from test1.gfa but the folder is named "test3" — confirm this is intended
ricci_flow = RicciFlow(G, dirsave_graphs="../output/test3/ricci-flow")

In [11]:
# Run 10 Ricci-Flow iterations; the call returns a graph, shown as the cell output
ricci_flow(iterations=10)

Ricci-Flow: 100%|██████████| 10/10 [00:00<00:00, 268.43it/s]


<networkx.classes.digraph.DiGraph at 0x7f509ddab410>

## Alignment of pangenome graphs

In [12]:
from panricci.utils.gfa_loader import GFALoader
from panricci.distributions.variation_graph import DistributionNodes
from panricci.ricci_flow import RicciFlow

In [13]:
# the two pangenome graphs to be aligned
path_graph1 = Path("../data/test1.gfa")
path_graph2 = Path("../data/test2.gfa")

# file stems ("test1", "test2") are used to name each graph's output folder below
graph1_name = path_graph1.stem
graph2_name = path_graph2.stem

# load graphs (only the third item returned by the loader, the graph object, is kept)
gfa_loader = GFALoader(undirected=False)
_, _, graph1 = gfa_loader(path_graph1)
_, _, graph2 = gfa_loader(path_graph2)

# define distribution of nodes for each graph
# NOTE(review): these two objects are not referenced again in this notebook — confirm whether they are still needed
graph1_distribution_nodes = DistributionNodes(graph1, alpha=0.5)
graph2_distribution_nodes = DistributionNodes(graph2, alpha=0.5)

# instantiate Ricci-Flow for each graph, each with its own folder for saved graphs
graph1_ricci_flow = RicciFlow(graph1, dirsave_graphs=f"../output/{graph1_name}/ricci-flow")
graph2_ricci_flow = RicciFlow(graph2, dirsave_graphs=f"../output/{graph2_name}/ricci-flow")

# Run Ricci-Flow for a number of iterations; the returned graphs carry the
# "distance" edge attribute that the shortest-path cells further down read
ITERATIONS=10
ricci_graph1 = graph1_ricci_flow(iterations=ITERATIONS, save_intermediate_graphs=True)
ricci_graph2 = graph2_ricci_flow(iterations=ITERATIONS, save_intermediate_graphs=True)

Ricci-Flow: 100%|██████████| 10/10 [00:00<00:00, 22.32it/s]
Ricci-Flow: 100%|██████████| 10/10 [00:00<00:00, 111.82it/s]


2. Add a source node and a sink node to both graphs; every edge that connects them to the graph gets an arbitrary distance of $1$.

In [15]:
# Edges of both graphs after Ricci-Flow, before the source/sink nodes are attached
ricci_graph1.edges(), ricci_graph2.edges()

(OutEdgeView([('1', '3'), ('1', '2'), ('2', '3')]),
 OutEdgeView([('1', '3'), ('1', '2'), ('2', '3')]))

In [16]:
def get_sources_sinks(path_gfa):
    """Collect the first and last segment of every path (P line) of a GFA file.

    Parameters
    ----------
    path_gfa : str or os.PathLike
        Location of the GFA file to scan.

    Returns
    -------
    sources, sinks : tuple of two lists of str
        Segment ids, without their orientation sign, of the first and of the
        last step of each P line, in file order. An id appears once per path,
        so it is repeated when several paths share an end point.
    """
    def strip_orientation(step):
        # Only the trailing sign encodes the orientation; a '+' or '-' that is
        # part of the segment name itself must be preserved.
        return step[:-1] if step.endswith(("+", "-")) else step

    sources = []
    sinks = []
    with open(path_gfa, "r") as fp:
        # iterate lazily instead of loading the whole file with readlines()
        for line in fp:
            if not line.startswith("P"):
                continue

            # P <path name> <comma-separated oriented segments> [further fields]
            _, _, path, *_ = line.rstrip("\n").split("\t")
            steps = path.split(",")

            sources.append(strip_orientation(steps[0]))
            sinks.append(strip_orientation(steps[-1]))

    return sources, sinks


In [17]:
# graph1: link an artificial "source" node to the first node of every path and
# the last node of every path to an artificial "sink" node.
# Every added edge carries distance=1 and label="N"; the graph is modified in place.
sources, sinks = get_sources_sinks(path_graph1)
ricci_graph1.add_edges_from([("source",node) for node in sources] , distance=1, label="N")
ricci_graph1.add_edges_from([(node,"sink") for node in sinks] , distance=1, label="N")
ricci_graph1.nodes(), ricci_graph1.edges()

(NodeView(('1', '2', '3', 'source', 'sink')),
 OutEdgeView([('1', '3'), ('1', '2'), ('2', '3'), ('3', 'sink'), ('source', '1')]))

In [18]:
# graph2: link an artificial "source" node to the first node of every path and
# the last node of every path to an artificial "sink" node.
# Every added edge carries distance=1 and label="N"; the graph is modified in place.
sources, sinks = get_sources_sinks(path_graph2)
ricci_graph2.add_edges_from([("source",node) for node in sources] , distance=1, label="N")
ricci_graph2.add_edges_from([(node,"sink") for node in sinks] , distance=1, label="N")
ricci_graph2.nodes(), ricci_graph2.edges()

(NodeView(('1', '2', '3', 'source', 'sink')),
 OutEdgeView([('1', '3'), ('1', '2'), ('2', '3'), ('3', 'sink'), ('source', '1')]))

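3. Let $s$ be the source node and $t$ the sink node of a graph. For each graph and each node $u \neq s,t$, compute the vector $$u_L = [d(s,u), d(u,t)]$$ where $d(\cdot,\cdot)$ is the shortest-path distance under the edge attribute `distance`.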

**Graph 1**

In [19]:
# Dijkstra shortest paths in graph 1, weighted by the "distance" edge attribute.
# sp_from_source: node -> path (list of nodes) going from "source" to that node
# sp_until_sink:  node -> path (list of nodes) going from that node to "sink"
sp_from_source = nx.shortest_path(ricci_graph1, source="source", weight="distance", method="dijkstra")
sp_until_sink = nx.shortest_path(ricci_graph1, target="sink", weight="distance", method="dijkstra")

# drop the trivial source->source and sink->sink entries
del sp_from_source["source"]
del sp_until_sink["sink"]

In [20]:
# Total "distance" of every shortest path of graph 1, obtained by summing the
# attribute over the consecutive node pairs of the path:
#   costs_from_source[u] = d(source, u)      costs_until_sink[u] = d(u, sink)
# Comprehensions keep the loop variables local, so the `nodes` / `edges`
# objects loaded from the GFA earlier in the notebook are not overwritten here.
costs_from_source = {
    reached: np.sum([ricci_graph1.edges[u, v]["distance"] for u, v in zip(path[:-1], path[1:])])
    for reached, path in sp_from_source.items()
}
costs_until_sink = {
    origin: np.sum([ricci_graph1.edges[u, v]["distance"] for u, v in zip(path[:-1], path[1:])])
    for origin, path in sp_until_sink.items()
}

costs_from_source, costs_until_sink

({'1': 1,
  '3': 1.6318812829104834,
  '2': 1.3531559590013926,
  'sink': 2.6318812829104834},
 {'3': 1, '1': 1.6318812829104834, '2': 1.5, 'source': 2.6318812829104834})

In [21]:
# One 2-D vector per original node u of graph 1: [d(source, u), d(u, sink)].
# The artificial source and sink nodes themselves are left out.
graph1_vector_nodes = {
    u: np.array([costs_from_source[u], costs_until_sink[u]])
    for u in ricci_graph1.nodes()
    if u != "source" and u != "sink"
}
graph1_vector_nodes

{'1': array([1.        , 1.63188128]),
 '2': array([1.35315596, 1.5       ]),
 '3': array([1.63188128, 1.        ])}

**Graph 2**

In [22]:
# Dijkstra shortest paths in graph 2, weighted by the "distance" edge attribute.
# sp_from_source: node -> path (list of nodes) going from "source" to that node
# sp_until_sink:  node -> path (list of nodes) going from that node to "sink"
# Both names are reused from the graph 1 section and now refer to graph 2.
sp_from_source = nx.shortest_path(ricci_graph2, source="source", weight="distance", method="dijkstra")
sp_until_sink = nx.shortest_path(ricci_graph2, target="sink", weight="distance", method="dijkstra")

# drop the trivial source->source and sink->sink entries
del sp_from_source["source"]
del sp_until_sink["sink"]

In [24]:
# Total "distance" of every shortest path of graph 2, obtained by summing the
# attribute over the consecutive node pairs of the path:
#   costs_from_source[u] = d(source, u)      costs_until_sink[u] = d(u, sink)
# The distances are read from ricci_graph2, the graph these paths were computed on.
# Comprehensions keep the loop variables local, so the `nodes` / `edges`
# objects loaded from the GFA earlier in the notebook are not overwritten here.
costs_from_source = {
    reached: np.sum([ricci_graph2.edges[u, v]["distance"] for u, v in zip(path[:-1], path[1:])])
    for reached, path in sp_from_source.items()
}
costs_until_sink = {
    origin: np.sum([ricci_graph2.edges[u, v]["distance"] for u, v in zip(path[:-1], path[1:])])
    for origin, path in sp_until_sink.items()
}

costs_from_source, costs_until_sink

({'1': 1,
  '3': 1.6318812829104834,
  '2': 1.3531559590013926,
  'sink': 2.6318812829104834},
 {'3': 1, '1': 1.6318812829104834, '2': 1.5, 'source': 2.6318812829104834})

In [25]:
# One 2-D vector per original node u of graph 2: [d(source, u), d(u, sink)].
# The artificial source and sink nodes themselves are left out.
graph2_vector_nodes = {
    u: np.array([costs_from_source[u], costs_until_sink[u]])
    for u in ricci_graph2.nodes()
    if u != "source" and u != "sink"
}
graph2_vector_nodes

{'1': array([1.        , 1.63188128]),
 '2': array([1.35315596, 1.5       ]),
 '3': array([1.63188128, 1.        ])}

4. Find the alignment between both graphs using [minimum_weight_full_matching](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.matching.minimum_weight_full_matching.html)

- First create a bipartite graph whose two sides are the node sets of the two graphs
- Then add the weights of matching each pair of nodes as $|| u_1 - u_2 ||$

In [26]:
# The artificial source and sink nodes were only needed to compute the distance
# vectors; remove them (in place) so that only original nodes take part in the matching
ricci_graph1.remove_nodes_from(["source","sink"])
ricci_graph2.remove_nodes_from(["source","sink"])

In [27]:
# Complete bipartite graph between the node sets of the two graphs.
# A node <node> of the first graph is labelled '<node>-1', a node of the second one '<node>-2'.
# The weight of an edge (u, v) is the Euclidean norm of the difference between the vectors of u and v.
bipartite_graph = nx.Graph()
bipartite_graph.add_weighted_edges_from(
    (
        u + "-1",
        v + "-2",
        np.linalg.norm(graph1_vector_nodes[u] - graph2_vector_nodes[v]),
    )
    for u in ricci_graph1.nodes()
    for v in ricci_graph2.nodes()
)

In [28]:
# Every node of graph 1 ('*-1') is linked to every node of graph 2 ('*-2')
bipartite_graph.edges()

EdgeView([('1-1', '1-2'), ('1-1', '2-2'), ('1-1', '3-2'), ('1-2', '2-1'), ('1-2', '3-1'), ('2-2', '2-1'), ('2-2', '3-1'), ('3-2', '2-1'), ('3-2', '3-1')])

In [29]:
# Full matching of minimum total "weight" between the two sides of the bipartite graph.
# The returned dict lists each matched pair in both directions
# (e.g. '3-1' -> '3-2' as well as '3-2' -> '3-1').
alignment = nx.bipartite.minimum_weight_full_matching(bipartite_graph, weight="weight")
alignment

{'3-1': '3-2',
 '1-1': '1-2',
 '2-1': '2-2',
 '3-2': '3-1',
 '1-2': '1-1',
 '2-2': '2-1'}