In [79]:
from scipy.stats import fisher_exact
from statsmodels.sandbox.stats.multicomp import multipletests
import networkx as nx
import pandas as pd

def geneset_enrich(term_set, genesets, bk_set):
    """Test one query set for over-representation in each of several gene sets.

    The query set and every gene set are first intersected with the
    background.  For each gene set that still shares at least one member with
    the query, a one-sided (``alternative='greater'``) Fisher exact test is
    run on the 2x2 table::

        [[in both,           in query only],
         [in gene set only,  in neither   ]]

    The p-values of all tested gene sets are then adjusted together by
    ``multipletests``, called with its default settings.

    Parameters
    ----------
    term_set : set
        Query members.
    genesets : dict
        Maps a key to the set of members of that gene set.
    bk_set : set
        Background; members outside it are ignored on both sides.

    Returns
    -------
    list of list
        One ``[key, odds_ratio, p_value, adjusted_p_value]`` row per tested
        gene set, in the iteration order of ``genesets``.  Gene sets with no
        overlap with the query (inside the background) are skipped; the
        result is ``[]`` when nothing was tested.
    """
    query = term_set & bk_set
    n_background = len(bk_set)

    rows = []
    for key, members in genesets.items():
        members = members & bk_set
        n_both = len(members & query)
        # A non-empty overlap implies a non-empty gene set, so one check is enough.
        if not n_both:
            continue
        # A nested list is sufficient here: fisher_exact accepts any
        # array-like, so the function does not depend on a global `np`.
        table = [
            [n_both, len(query - members)],
            [len(members - query), n_background - len(query | members)],
        ]
        odds_ratio, p_value = fisher_exact(table, alternative='greater')
        rows.append([key, odds_ratio, p_value])

    if rows:
        # Element [1] of the multipletests result holds the adjusted p-values.
        adjusted = multipletests([row[2] for row in rows])[1]
        rows = [row + [adj_p] for row, adj_p in zip(rows, adjusted)]
    return rows

In [81]:
tissue = 'Adipose_Subcutaneous'
# Edge table for this tissue (its `From` / `To` columns are used below) and the
# header-less gene/module table (column 0 and column 1 are used further down).
df_bn = pd.read_csv('../processed_data/GTEX_BN/GTEx_%s_EdgeFile.csv' % tissue)
df_wgcna = pd.read_csv('./WGCNA/%s/gene_module.csv' % tissue, header = None)


# G0: directed graph with edges From -> To.  Identifiers are truncated at the
# first '.' (presumably a version suffix -- confirm against the edge file).
G0 = nx.DiGraph()
G0.add_edges_from([(s.split('.')[0], t.split('.')[0]) for s, t in zip(df_bn.From, df_bn.To)])
# Computed once; the original cell assigned D0 a second time with the same
# call before anything had read it.
D0 = nx.shortest_path_length(G0)

# G1: the same edges with their direction flipped (To -> From).
G1 = nx.DiGraph()
G1.add_edges_from([(t.split('.')[0], s.split('.')[0]) for s, t in zip(df_bn.From, df_bn.To)])
D1 = nx.shortest_path_length(G1)

# For every node, collect the nodes that D0 lists for it (excluding the node
# itself); nodes with fewer than 3 such nodes are left out.
ch_set = {}
for g, item in D0:
    tmp = set(item) - {g}
    if len(tmp) >= 3:
        ch_set[g] = tmp

# Module label (column 1) -> set of column-0 values; the 'grey' module is skipped.
mod2genes = {
    module: set(rows[0])
    for module, rows in df_wgcna.groupby(1)
    if module != 'grey'
}

# Enrichment of each module against every collected node set.
# NOTE(review): `background0` is not defined in the visible cells -- it must
# already exist in the kernel; confirm where it comes from.
res_all = {}
for k, gs in mod2genes.items():
    res = geneset_enrich(gs, ch_set, background0)
    if not res:
        continue
    res_all[k] = res