## Generate the Correlation Matrix

In [22]:
import pandas as pd
coexp_df = pd.read_csv("../data/rsingh/hs_encode_rnaseq_NORMALIZED.csv")

In [23]:
coexp_mat = coexp_df.to_numpy()
coexp_map = {k: i for i, k in enumerate(coexp_df.columns)}
coexp_mat = coexp_mat.T

In [24]:
list(coexp_map.keys())[:10]

['ENSG00000000003',
 'ENSG00000000005',
 'ENSG00000000419',
 'ENSG00000000971',
 'ENSG00000001036',
 'ENSG00000001167',
 'ENSG00000001460',
 'ENSG00000001497',
 'ENSG00000001561',
 'ENSG00000001617']

In [26]:
r_coexp_map = {i: k for k, i in coexp_map.items()}
name_df     = pd.read_csv("../data/rsingh/Biomart_Entrez-to-Uniprot_mapping.tsv", sep = "\t")
gid_uprot   = {k: f"uniprotkb:{v}" for k, v in name_df.iloc[:, [0, 3]].values} 
name_df.head().T

Unnamed: 0,0,1,2,3,4
Gene stable ID,ENSG00000198888,ENSG00000198888,ENSG00000198888,ENSG00000198763,ENSG00000198763
Gene stable ID version,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198763.3,ENSG00000198763.3
UniProtKB/TrEMBL ID,U5Z754,U5Z754,U5Z754,A0A1X7RBG6,A0A1X7RBG6
UniProtKB/Swiss-Prot ID,P03886,P03886,P03886,P03891,P03891
Gene Synonym,MTND1,NAD1,ND1,MTND2,NAD2
NCBI gene (formerly Entrezgene) ID,4535.0,4535.0,4535.0,4536.0,4536.0
Gene name,MT-ND1,MT-ND1,MT-ND1,MT-ND2,MT-ND2


In [27]:
from scipy.stats.stats import pearsonr
import numpy as np

n = len(coexp_map)

# The final correlation matrix
corrmat = np.zeros((n, n))

try:
    for i in range(n):
        for j in range(i):
            corr, pval = pearsonr(coexp_mat[i], coexp_mat[j])
            corrmat[i, j] = corr
            corrmat[j, i] = corr
except:
    print(f"{i} {j}")
corrmat



array([[ 0.        ,  0.19728986,  0.0382598 , ...,         nan,
                nan,         nan],
       [ 0.19728986,  0.        , -0.11627219, ...,         nan,
                nan,         nan],
       [ 0.0382598 , -0.11627219,  0.        , ...,         nan,
                nan,         nan],
       ...,
       [        nan,         nan,         nan, ...,  0.        ,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
         0.        ,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,  0.        ]])

In [28]:
corrmat = np.where(corrmat == np.nan, 0, corrmat)
corrmat = np.absolute(corrmat)
np.save("data+results/correlation-mat.npy", corrmat)

In [29]:
row, col = np.nonzero(np.triu(corrmat))

In [30]:
val = [corrmat[r, c] for r, c in zip(row, col)]

### Applying the thresholding of 0.6. 
Citation: https://academic.oup.com/bioinformatics/article/23/16/2096/198635

In [31]:
import networkx as nx

G_all = nx.Graph()
edges_t       = [(r_coexp_map[i], r_coexp_map[j], w) for i, j, w in zip(row, col, val) if w >= 0.6]
uprot_edges_t = [(gid_uprot[k], gid_uprot[v], w) for k, v, w in edges_t if 
                (k in gid_uprot and v in gid_uprot)] 
G_all.add_weighted_edges_from(uprot_edges_t)

#### Saving

In [32]:
nx.write_weighted_edgelist(G_all, "../results/coexp_0.6.txt")

In [33]:
cc = max(nx.connected_components(G_all), key = len)
len(cc), G_all.number_of_nodes()

(4998, 5027)

In [35]:
import json
np.save("data+results/correlation-mat.npy", corr)
with open("correlation.json", "w") as oj:
    json.dump(coexp_map, oj)

## Loading the networks

In [41]:
import pandas as pd

coexpdf = pd.read_csv("../results/coexp_0.6.txt", sep = "\t", header = None)
coexpdf

Unnamed: 0,0,1,2,3
0,4764,uniprotkb:P20618,uniprotkb:P53701,0.622359
1,4765,uniprotkb:P20618,uniprotkb:Q9NS18,0.688044
2,4766,uniprotkb:P20618,uniprotkb:P82664,0.656090
3,4767,uniprotkb:P20618,uniprotkb:Q9UKD2,0.645368
4,4768,uniprotkb:P20618,uniprotkb:Q9ULW0,0.621352
...,...,...,...,...
169396,174160,uniprotkb:P10147,uniprotkb:Q8NHW4,0.647214
169397,174161,uniprotkb:P10147,uniprotkb:P16619,0.712838
169398,174162,uniprotkb:Q8NHW4,uniprotkb:P16619,0.609473
169399,174163,uniprotkb:Q9UI38,uniprotkb:A6NEH6,0.830250


### Loading Y2H and coIP networks

In [42]:
coip = pd.read_csv("../data/networks/coip_hc_full.tsv", sep = "\t", header = None)
y2h =  pd.read_csv("../data/networks/y2h_hc_full.tsv", sep = "\t", header = None)
coip_sh =  pd.read_csv("../data/networks/coip_hc_shared.tsv", sep = "\t", header = None)
y2h_sh =  pd.read_csv("../data/networks/y2h_hc_shared.tsv", sep = "\t", header = None)
coip

Unnamed: 0,0,1,2
0,uniprotkb:A0A0B4J1S8,uniprotkb:Q9H3P7,0.57
1,uniprotkb:Q9H3P7,uniprotkb:O43493,0.55
2,uniprotkb:Q9H3P7,uniprotkb:Q8WUA7,0.64
3,uniprotkb:Q9H3P7,uniprotkb:Q08378,0.40
4,uniprotkb:Q9H3P7,uniprotkb:Q9UBF8,0.54
...,...,...,...
29228,uniprotkb:Q9Y3C0,uniprotkb:Q9Y2V7,0.67
29229,uniprotkb:Q9Y3D3,uniprotkb:Q9Y4X4,0.42
29230,uniprotkb:Q9Y4X4,uniprotkb:Q8NCR0,0.42
29231,uniprotkb:Q9Y6I4,uniprotkb:Q8N5D0,0.40


In [43]:
def getnodes(netdf):
    return set(netdf[0]).union(set(netdf[1]))
cn = getnodes(coip)
yn = getnodes(y2h)
csn = getnodes(coip_sh)
ysn = getnodes(y2h_sh)
coexpn = getnodes(coexpdf)

In [45]:
len(cn.intersection(coexpn)), len(yn.intersection(coexpn)), len(csn.intersection(coexpn)), len(ysn.intersection(coexpn)), len(coexpn), len(cn), len(yn)

(1151, 1321, 545, 534, 173070, 8433, 7364)

In [46]:
import networkx as nx

def get_intersection_ratio(dfnet, dfco):
    Gnet = nx.from_pandas_edgelist(dfnet, 0, 1, 2)
    Gco = nx.from_pandas_edgelist(dfco, 0, 1, 2)
    node_inter = set(Gnet.nodes()).intersection(set(Gco.nodes()))
    print(len(node_inter))
    Gsubnet = Gnet.subgraph(list(node_inter))
    Gsubco  = Gco.subgraph(list(node_inter))
    
    Gsubint = nx.intersection(Gsubnet, Gsubco)
    return float(Gsubint.number_of_edges()) / Gsubco.number_of_edges()

get_intersection_ratio(coip, coexpdf)

ZeroDivisionError: float division by zero