In [1]:
import os
import pandas as pd
import numpy as np
import h5py
from collections import Counter
# Python 2 provides the C-accelerated pickler as cPickle; on Python 3 that
# module does not exist and plain pickle is used instead. Only ImportError is
# caught so that any other failure is not silently swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle

import sys
sys.path.append(os.path.abspath('../src'))

from enrichment_helper import *

from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportion_confint
import scipy.stats as stats

import sqlite3
# Lookup table unpickled from disk; presumably Ensembl gene ID -> gene name,
# judging by the file name (TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code, so only trusted files
# should be read here.
with open('../src/ensembl2genename_GRCh38.85.p', 'rb') as handle:
    gid2gn = pickle.load(handle)
    
  

In [2]:
# Phenotype tag: filters the MetaXcan results and indexes the driver-gene pickle.
selected_phenotype = 'IBD.EUR.Ulcerative_Colitis'
# Pickle file whose contents are handed to lincs_match.
lincs_file = 'CD_signatures/UC_lincs_l1000.p'
# Destination of the final per-gene summary table.
fn_out = 'UC_targets.csv'


def _contains_any(x, keywords):
    """Return True if the lower-cased text ``x`` contains any of ``keywords``.

    Values without a ``lower`` method (for example None, or the NaN pandas
    uses for a missing cell) are treated as "no match" instead of raising
    AttributeError.
    """
    try:
        text = x.lower()
    except AttributeError:
        return False
    return any(keyword in text for keyword in keywords)


def mgi_search_criteria(x):
    """Select MGI disease names mentioning 'bowel' or 'ulcer' (case-insensitive)."""
    return _contains_any(x, ('bowel', 'ulcer'))


def creed_search_criteria(x):
    """Select CREED disease names mentioning 'bowel' or 'ulcer' (case-insensitive)."""
    return _contains_any(x, ('bowel', 'ulcer'))

In [3]:
## MGI
# Project helper returning the MGI lookups; hg2mg, mg, mg2pheno and pheno are
# used below and by the later mgi_match calls.
hg2mg, mg, mg2pheno, pheno, pheno2name = load_mgi('../../../project_wrapup/GSK_project/GWAS_component/data/MGI/')

# Disease-to-mouse-gene annotations (tab separated).
df = pd.read_csv('../../../project_wrapup/GSK_project/GWAS_component/data/MGI/MGI_DO.rpt', sep = '\t')

# Map every mouse gene annotated to a matching disease to its set of phenotypes.
dis2pheno = {}
for d, m in zip(df['DO Disease Name'], df['Mouse MGI ID']):
    # pandas represents missing cells as NaN (not None); skip them explicitly
    # so neither the name predicate nor the lookup sees a non-string value.
    if pd.isnull(d) or pd.isnull(m):
        continue
    if not mgi_search_criteria(d) or m not in mg:
        continue
    # A single pre-formatted string prints identically on Python 2 and 3.
    print('%s %s %s %s' % (d, m, m in mg, len(mg2pheno[m])))
    dis2pheno[m] = set(mg2pheno[m])

inflammatory bowel disease MGI:102672 True 96
inflammatory bowel disease MGI:104798 True 221
inflammatory bowel disease MGI:103038 True 173
inflammatory bowel disease MGI:109495 True 45
inflammatory bowel disease MGI:1339364 True 44
inflammatory bowel disease MGI:96548 True 104
inflammatory bowel disease MGI:96537 True 188
inflammatory bowel disease MGI:88337 True 57
inflammatory bowel disease MGI:96549 True 38
inflammatory bowel disease 1 MGI:2429397 True 37
inflammatory bowel disease 10 MGI:1924290 True 40
inflammatory bowel disease 12 MGI:95772 True 78
inflammatory bowel disease 13 MGI:97570 True 40
inflammatory bowel disease 16 MGI:2180140 True 13


In [4]:
# Read the records stored for the selected phenotype and translate the first
# field of each record's second element through gid2gn.
with open('./gcomp_drive_genes.p', 'rb') as handle:
    driver_records = pickle.load(handle)[selected_phenotype]
drivers = {gid2gn[entry[0]] for _, entry in driver_records}

In [5]:
#!/usr/bin/env python
import pandas
import sqlite3

connection = sqlite3.connect("../../data/gwas_g2p/metaxcan_results_p/metaxcan_results_v1.5.db")

# Join gene, result, tissue and phenotype tables and keep the rows of one
# phenotype. The tag is passed as a bound parameter (sqlite3 "?" placeholder)
# instead of being pasted into the SQL text, so quoting is handled by the driver.
query = 'SELECT g.gene_name, m.zscore, m.n_snps_used, m.n_snps_model, p.tag as phenotype, t.tissue as tissue, g.gene ' 
query += ' FROM gene AS g INNER JOIN metaxcan_result AS m ON g.id = m.gene_id' 
query += ' INNER JOIN tissue AS t ON t.id = m.tissue_id  INNER JOIN pheno AS p ON p.id = m.pheno_id'
query += ' WHERE p.tag = ?'

# Release the database handle even if the query fails.
try:
    pi = pandas.read_sql_query(query, connection, params=(selected_phenotype,))
finally:
    connection.close()

# Exclude every row whose tissue is 'DGN_WB'.
pi = pi[pi.tissue != 'DGN_WB']


In [6]:
from scipy.stats import norm 
import statsmodels.stats.multitest as multi

# Two-sided p-value of each z-score under the standard normal distribution.
pvals = 2 * norm.sf(np.abs(pi.zscore))
# multipletests is called with its default settings; the second element of
# its return value holds the adjusted p-values.
res_adjp = multi.multipletests(pvals)[1]


In [7]:
# Gene names having at least one row with adjusted p-value below 0.05.
sgenes = set(pi.gene_name[res_adjp < 0.05])
# print() with a single argument renders the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(len(sgenes))

117


In [8]:
# MGI check for the MetaXcan-selected genes. The significance level is 0.05
# divided by the number of disease-associated mouse genes in dis2pheno.
mgi_res_meta = mgi_match(
    sgenes, hg2mg, mg2pheno, dis2pheno, pheno,
    alpha_sig = 0.05 / float(len(dis2pheno)),
)
mgi_hit_meta = {gene for gene, _ in mgi_res_meta.items()}
# n1: genes returned by mgi_match; n0: selected genes present in hg2mg.
n1 = len(mgi_hit_meta)
n0 = sum(1 for gene in sgenes if gene in hg2mg)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(28, 104, 0.2692307692307692, (0.18398282908661615, 0.3544787093749223))


In [9]:
# MGI check for the driver genes. The significance level is 0.05 divided by
# the number of disease-associated mouse genes in dis2pheno.
mgi_res_gc = mgi_match(
    drivers, hg2mg, mg2pheno, dis2pheno, pheno,
    alpha_sig = 0.05 / float(len(dis2pheno)),
)
mgi_hit_gc = {gene for gene, _ in mgi_res_gc.items()}
# n1: genes returned by mgi_match; n0: driver genes present in hg2mg.
n1 = len(mgi_hit_gc)
n0 = sum(1 for gene in drivers if gene in hg2mg)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(35, 97, 0.36082474226804123, (0.2652550128210566, 0.45639447171502584))


In [10]:
# Print every gene of the combined set as a quoted, comma-terminated name,
# eight names per line. Slicing in steps of eight also emits the final,
# shorter line (the earlier accumulate-and-flush loop never printed it) and
# avoids printing an empty first line.
genes_to_print = list(drivers | sgenes)
for start in range(0, len(genes_to_print), 8):
    print(''.join('\'%s\', ' % g for g in genes_to_print[start:start + 8]))


'TMEM89', 'PGAP3', 'TAP2', 'TMOD1', 'CTD-2330K9.3', 'CKM', 'SPTB', 'AGER', 
'SELPLG', 'LAPTM5', 'ARFRP1', 'STK19', 'MYL2', 'RASAL1', 'NKX2-3', 'KIF2C', 
'C6orf15', 'CACNA1H', 'GNL1', 'DDX39B', 'MT-ND5', 'MT-ND6', 'TYROBP', 'GSDMB', 
'NXPE1', 'DLGAP5', 'APEH', 'HLA-DQB1', 'MYB', 'PYGM', 'ZPBP2', 'HLA-DPB1', 
'TBXAS1', 'PSORS1C1', 'ARHGEF19', 'ALOX5', 'HLA-DRB1', 'KRT24', 'HLA-DRB5', 'GALNT15', 
'IP6K2', 'CXCL16', 'ORMDL3', 'LST1', 'FKBPL', 'MST1R', 'C1QA', 'BAK1', 
'ZGPAT', 'PRSS22', 'DHX16', 'TOP2A', 'TCF19', 'CD3E', 'DDAH2', 'STMN3', 
'DOCK2', 'ZWINT', 'CDKN1A', 'ANO1', 'C3orf62', 'TNFRSF14', 'LY6G5C', 'SFTA2', 
'PKHD1L1', 'NPRL2', 'SLC9A4', 'HCG27', 'TNNT2', 'APOM', 'MST1', 'APOH', 
'CARD9', 'SKIV2L', 'PRKG1', 'C4A', 'C4B', 'HLA-DQB2', 'PCOLCE', 'GPANK1', 
'NPPB', 'AMT', 'TMEM45B', 'ALPK3', 'AGPAT1', 'DLD', 'FAM212A', 'UBA7', 
'DAG1', 'BLK', 'HCK', 'BTNL2', 'HOXA-AS3', 'DNER', 'ITGAX', 'C6orf48', 
'IRF8', 'KCNH8', 'CDC20', 'GPX1', 'MELK', 'STK33', 'SORT1', 'VPS52', 
'NOV', 'KLHDC8B'

In [11]:
## Lincs 1000: CREED disease signatures (passed to lincs_match in later cells)
df0 = pd.read_csv('../../../data/CREED/Disease_entire_exp_landmark.tsv.gz', sep = '\t')
df1 = pd.read_csv('../../../data/CREED/Disease_signatures_info_data.tsv', sep = '\t')

# IDs of the signatures whose disease name satisfies the search rule; the IDs
# are used as column labels of df0.
ind = [d0 for d0, d1 in zip(df1.id, df1.disease_name) if creed_search_criteria(d1)]
# One row per selected signature, each divided by its Euclidean norm.
# NOTE(review): an all-zero signature would divide by zero here; no epsilon is
# added, unlike the reference-signature normalisation.
dis = df0[ind].values.T
dis = dis / np.linalg.norm(dis, axis = 1, keepdims=True)
# Single-argument print() renders the same on Python 2 and Python 3.
print(dis.shape)

(22, 978)


In [12]:
# Reference records; each one carries a 'chdirLm' vector.
# NOTE(review): pickle.load can execute arbitrary code, so only trusted files
# should be read here.
with open('CD_signatures/REF_lincs_l1000.p', 'rb') as handle:
    res = pickle.load(handle)

# Stack the vectors row-wise, then scale every row by its Euclidean norm.
# The 1e-5 added to the norm avoids dividing by zero for an all-zero row.
cd_ref = np.vstack([record[u'chdirLm'] for record in res])
row_norms = np.sqrt(np.sum(cd_ref ** 2, axis = 1)) + 1e-5
cd_ref /= row_norms[:, np.newaxis]

# Signatures for the phenotype under study (path set in the config cell).
with open(lincs_file, 'rb') as handle:
    res_shrna = pickle.load(handle)

In [13]:
# LINCS check for the MetaXcan-selected genes.
lincs_res_meta = lincs_match(sgenes, res_shrna, dis, cd_ref, alpha_sig = 0.05)
# A gene counts as a hit when the first element of its result entry is positive.
lincs_hit_meta = {gene for gene, entry in lincs_res_meta.items() if entry[0] > 0}
# n1: hits; n0: genes for which lincs_match returned an entry.
n1 = len(lincs_hit_meta)
n0 = len(lincs_res_meta)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(7, 58, 0.1206896551724138, (0.036851790439477006, 0.2045275199053506))


In [14]:
# LINCS check for the driver genes.
lincs_res_gc = lincs_match(drivers, res_shrna, dis, cd_ref, alpha_sig = 0.05)
# A gene counts as a hit when the first element of its result entry is positive.
lincs_hit_gc = {gene for gene, entry in lincs_res_gc.items() if entry[0] > 0}
# n1: hits; n0: genes for which lincs_match returned an entry.
n1 = len(lincs_hit_gc)
n0 = len(lincs_res_gc)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(6, 41, 0.14634146341463414, (0.038152750713928124, 0.2545301761153401))


In [15]:
# Sorted list of every gene selected by either method; it fixes the row order
# of the summary table.
id2gene = sorted(drivers.union(sgenes))

In [16]:
# Pool the per-method results over both gene selections.
# *_genes: genes the resource could be queried for; *_hits: genes it flagged.
mgi_genes = {gene for gene in drivers.union(sgenes) if gene in hg2mg}
mgi_hits = mgi_hit_gc.union(mgi_hit_meta)
lincs_genes = set(lincs_res_gc).union(lincs_res_meta)
lincs_hits = lincs_hit_gc.union(lincs_hit_meta)

In [17]:
# Build a 0/1 membership table: one row per gene of id2gene, one column per
# gene set, in the order listed here.
membership_sets = [drivers, sgenes, lincs_hits, lincs_genes, mgi_hits, mgi_genes]
df_res = pd.DataFrame(
    [[int(gene in members) for gene in id2gene] for members in membership_sets]
).T

In [18]:
# Label rows with gene names and columns with the set each indicator refers
# to (same order in which the indicator columns were built), then write the CSV.
df_res.index = id2gene
df_res.columns = [
    'genes_selected_by_GCA+BNs',
    'genes_selected_by_metaxcan',
    'LINCS_hit',
    'genes_found_in_LINCS',
    'MGI_hit',
    'genes_found_in_MGI',
]

df_res.to_csv(fn_out)