In [1]:
import networkx as nx
import numpy as np
import pandas as pd

## Load REACTOME pathways

In [2]:
import re

reactome_loc = "../data/rsingh/ReactomePathways.gmt"
pathways     = {}
desc         = {}
pathprots_h  = set()
with open(reactome_loc, "r") as rl:
    for line in rl:
        entries = re.split(r"\t", line.strip())
        pathways[entries[1]] = entries[2:]
        pathprots_h.update(entries[2:])
        desc[entries[1]]     = entries[0]

In [3]:
len(pathways)

2546

## Load the Namespace

In [4]:
namespace = pd.read_csv("../data/rsingh/Biomart_Entrez-to-Uniprot_mapping.tsv", sep = "\t")
namespace = namespace.dropna(subset = [namespace.columns[3], namespace.columns[6]])
namespace.head().T

Unnamed: 0,0,1,2,3,4
Gene stable ID,ENSG00000198888,ENSG00000198888,ENSG00000198888,ENSG00000198763,ENSG00000198763
Gene stable ID version,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198763.3,ENSG00000198763.3
UniProtKB/TrEMBL ID,U5Z754,U5Z754,U5Z754,A0A1X7RBG6,A0A1X7RBG6
UniProtKB/Swiss-Prot ID,P03886,P03886,P03886,P03891,P03891
Gene Synonym,MTND1,NAD1,ND1,MTND2,NAD2
NCBI gene (formerly Entrezgene) ID,4535.0,4535.0,4535.0,4536.0,4536.0
Gene name,MT-ND1,MT-ND1,MT-ND1,MT-ND2,MT-ND2


In [5]:
humap = {k: v for v, k in namespace.iloc[:, [3, 6]].values if all([k != None, v != None])}
uhmap = {v: k for k, v in humap.items()}
len(uhmap)

19221

## Process the Pathways based on this map

In [6]:
fpathways = {}
for key, value in pathways.items():
    accept = all([pid in humap for pid in value]) and len(value) >= 2
    if accept:
        fpathways[key] = [f"uniprotkb:{humap[k]}" for k in value]
len(fpathways), len(pathways)

(1571, 2546)

In [39]:
def check_path_strict(G, path):
    assert len(path) >= 2
    curr = path[0]
    for nxt in path[1:]:
        if not G.has_edge(curr, nxt):
            return 0
        curr = nxt
    return 1

def check_path_lenient(G, path):
    assert len(path) >= 2
    curr = path[0]
    for nxt in path[1:]:
        if G.has_edge(curr, nxt):
            return 1
        curr = nxt
    return 0

def check_path_ratio(G, path):
    assert len(path) >= 2
    curr  = path[0]
    ntrue = 0
    for nxt in path[1: ]:
        if G.has_edge(curr, nxt):
            ntrue += 1
    return float(ntrue) / (len(path) - 1)

def check_paths(G, pathways, check_types = []):
    assert len(check_types) > 0
    res = np.zeros((len(check_types), ))
    for _, path in pathways.items():
        for i, check in enumerate(check_types):
            res[i] += check(G, path)
    res /= len(pathways)
    return res
    
checks = [check_path_strict, check_path_ratio]

In [40]:
def filter_pathway(pathways, nodes, remove = True):
    """
    If remove is true, remove the pathway in `pathways` containing an entry which is not the member of the nodes
    """
    new_pathways = {}
    for (key, pathway) in pathways.items():
        new_pathway = []
        reject_path = False
        for node in pathway:
            if node not in nodes:
                if remove:
                    reject_path = True
                    break
                else:
                    continue
            new_pathway.append(node)
        if len(new_pathway) > 1 and (not reject_path):
            new_pathways[key] = new_pathway
    return new_pathways

## Load Complete COIP network

In [41]:
import networkx as nx
cf  = pd.read_csv("../data/networks/coip_hc_full.tsv", delim_whitespace = True, header = None)
Gcf = nx.from_pandas_edgelist(cf, 0, 1, 2)

### Filter Paths

In [42]:
cpathsr = filter_pathway(fpathways, set(Gcf.nodes()))
len(cpathsr)

320

In [43]:
check_paths(Gcf, cpathsr, checks)

array([0.11875   , 0.20358331])

## Load Complete Y2H network

In [51]:
hs  = pd.read_csv("../data/networks/y2h_hc_full.tsv", delim_whitespace = True, header = None)
Ghf = nx.from_pandas_edgelist(hs, 0, 1, 2)
ypathsr = filter_pathway(fpathways, set(Ghf.nodes()))
print(len(ypathsr))
check_paths(Ghf, ypathsr, checks)

71


array([0.18309859, 0.13411581])

## Load Shared COIP network + Filter

In [50]:
csf = pd.read_csv("../data/networks/coip_hc_shared.tsv", delim_whitespace = True, header = None)
Gcs = nx.from_pandas_edgelist(csf, 0, 1, 2)

cspathsr = filter_pathway(fpathways, set(Gcs.nodes()))
print(f"Length of pathways: {len(cspathsr)}")
check_paths(Gcs, cspathsr, checks)

Length of pathways: 55


array([0.4       , 0.33747475])

## Load Shared Y2H network + Filter

In [48]:
ysf = pd.read_csv("../data/networks/y2h_hc_shared.tsv", delim_whitespace = True, header = None)
Gys = nx.from_pandas_edgelist(ysf, 0, 1, 2)

yspathsr = filter_pathway(fpathways, set(Gys.nodes()))
print(f"Length of pathways: {len(yspathsr)}")
check_paths(Gys, yspathsr, checks)

Length of pathways: 51


array([0.18671024])

In [49]:
checks

[<function __main__.check_path_strict(G, path)>,
 <function __main__.check_path_ratio(G, path)>]