In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import pickle                


In [2]:
df = pd.read_csv('../../data/gene_info/mart_export_GRCh38.p12.txt',dtype = {'NCBI gene ID':str}, sep = '\t')

entrez2ensembl = {}
for s0, s1 in zip(df['Gene stable ID'], df['NCBI gene ID']):
    if str(s1) != 'nan':
        entrez2ensembl[str(s1)] = str(s0)

In [3]:
df_ms = pd.read_csv('../../multiplespecies_network/stab1.csv')

## map ms-genes to ensembl
mg2g = {}
for g, mg in zip(df_ms['Human'], df_ms['Meta-gene']):
    if (' ' in g) or (g == 'No Human gene'): ## multiple genes or not human gene
        continue
    
    
    if str(g) not in entrez2ensembl:
        continue
        
    mg2g[mg] = entrez2ensembl[str(g)]   
    
msgenes = set([g for mg, g in mg2g.items()])

In [4]:
df_ms = pd.read_csv('../../multiplespecies_network/interactions.tab', sep = '\t')
            

In [9]:
from itertools import combinations
from scipy.stats import fisher_exact

path = '../../R_script/WGCNA/'
res = []

for t in os.listdir(path):
    dfm = pd.read_csv(path + '%s/gene_module.csv' % t, header = None)
    
    ## common genes in wgcna & multiple-species network
    common_genes = set(dfm[0]) & msgenes
    ## get order for genes to avoid double count edges
    g2ord = {g:i for i, g in enumerate(sorted(common_genes))}    
    
    ## get module genes
    mod2genes = {}
    for g, m in zip(dfm[0], dfm[1]):
        if g not in common_genes:
            continue
        if m == 'grey':
            continue

        if m not in mod2genes:
            mod2genes[m] = set()
        mod2genes[m].add(g)

    ## get multispecies edges between common genes
    elist_ms = set()
    for g1, g2 in zip(df_ms['Meta-gene'], df_ms['Neighbor']):
        g1, g2 = str(g1), str(g2)
        if g1 == g2:
            continue    
        if (g1 not in mg2g) or (g2 not in mg2g):
            continue
            
        g1, g2 = mg2g[g1], mg2g[g2]
        #g1, g2 = entrez2ensembl[g1], entrez2ensembl[g2]
        if (g1 not in common_genes) or (g2 not in common_genes):
            continue
            
        if g2ord[g1] < g2ord[g2]:
            elist_ms.add((g1, g2))
        else:
            elist_ms.add((g2, g1))

    ## get wgcna edges between common genes
    elist_wgcna = set()
    for m, gs in mod2genes.items():
        for g1, g2 in combinations(gs, r = 2):
            if g1 != g2:
                if g2ord[g1] < g2ord[g2]:
                    elist_wgcna.add((g1, g2))
                else:
                    elist_wgcna.add((g2, g1))
    
    n_genes = len(common_genes)
    
    n1 = len(elist_ms & elist_wgcna)
    n2 = len(elist_ms - elist_wgcna)
    n3 = len(elist_wgcna - elist_ms)
    n4 = n_genes * (n_genes - 1) - len(elist_ms | elist_wgcna)
    cmap = [[n1, n2], [n3, n4]]   
    oddr, pval = fisher_exact(cmap, alternative='greater')

    res.append((t, len(elist_ms), len(elist_wgcna), n1, float(n1) / float(len(elist_ms)), oddr, pval))


In [10]:
pd.DataFrame(res)

Unnamed: 0,0,1,2,3,4,5,6
0,Adipose_Subcutaneous,2063,5109,106,0.051381,15.087752,1.475148e-82
1,Adipose_Visceral_Omentum,868,736,22,0.025346,20.401984,3.000265e-21
2,Adrenal_Gland,526,957,41,0.077947,34.715905,1.055257e-46
3,Artery_Aorta,1421,3867,130,0.091485,26.672426,9.548233e-130
4,Artery_Coronary,416,1027,94,0.225962,80.16436,1.870788e-133
5,Artery_Tibial,2167,2237,75,0.03461,23.627922,8.370306e-73
6,Brain_Anterior_cingulate_cortex_BA24,318,835,4,0.012579,2.929982,0.05106284
7,Brain_Caudate_basal_ganglia,632,1485,65,0.102848,26.945525,9.375497e-66
8,Brain_Cerebellar_Hemisphere,593,735,17,0.028668,16.651711,2.326292e-15
9,Brain_Cerebellum,1116,1156,52,0.046595,29.995301,6.245883e-56


In [13]:
pd.DataFrame(res)[4].mean()

0.05383378413971012